github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/acceptance/localcluster/cluster.go (about)

     1  // Copyright 2016 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package localcluster
    12  
    13  import (
    14  	"bytes"
    15  	"context"
    16  	gosql "database/sql"
    17  	"fmt"
    18  	"go/build"
    19  	"io"
    20  	"io/ioutil"
    21  	"math/rand"
    22  	"net"
    23  	"net/url"
    24  	"os"
    25  	"os/exec"
    26  	"path/filepath"
    27  	"sort"
    28  	"strings"
    29  	"sync/atomic"
    30  	"text/tabwriter"
    31  	"time"
    32  
    33  	"github.com/cockroachdb/cockroach/pkg/base"
    34  	"github.com/cockroachdb/cockroach/pkg/config/zonepb"
    35  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    36  	"github.com/cockroachdb/cockroach/pkg/rpc"
    37  	"github.com/cockroachdb/cockroach/pkg/security"
    38  	"github.com/cockroachdb/cockroach/pkg/server/serverpb"
    39  	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
    40  	"github.com/cockroachdb/cockroach/pkg/testutils"
    41  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    42  	"github.com/cockroachdb/cockroach/pkg/util/log"
    43  	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
    44  	"github.com/cockroachdb/cockroach/pkg/util/retry"
    45  	"github.com/cockroachdb/cockroach/pkg/util/stop"
    46  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    47  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    48  	"github.com/cockroachdb/cockroach/pkg/util/tracing"
    49  	"github.com/cockroachdb/errors"
    50  	"github.com/gogo/protobuf/proto"
    51  	// Import postgres driver.
    52  	_ "github.com/lib/pq"
    53  )
    54  
    55  func repoRoot() string {
    56  	root, err := build.Import("github.com/cockroachdb/cockroach", "", build.FindOnly)
    57  	if err != nil {
    58  		panic(fmt.Sprintf("must run from within the cockroach repository: %s", err))
    59  	}
    60  	return root.Dir
    61  }
    62  
    63  // SourceBinary returns the path of the cockroach binary that was built with the
    64  // local source.
    65  func SourceBinary() string {
    66  	return filepath.Join(repoRoot(), "cockroach")
    67  }
    68  
// listeningURLFile is the name of the file, relative to a node's data
// directory, that is passed to the node via --listening-url-file and polled
// to discover the node's connection URL.
const listeningURLFile = "cockroachdb-url"
    70  
    71  // IsUnavailableError returns true iff the error corresponds to a GRPC
    72  // connection unavailable error.
    73  func IsUnavailableError(err error) bool {
    74  	return strings.Contains(err.Error(), "grpc: the connection is unavailable")
    75  }
    76  
    77  // A ClusterConfig holds the configuration for a Cluster.
    78  type ClusterConfig struct {
    79  	Ephemeral   bool               // when true, wipe DataDir on Close()
    80  	Binary      string             // path to cockroach, defaults go <cockroach_repo>/cockroach
    81  	AllNodeArgs []string           // args to pass to ./cockroach on all nodes
    82  	NumNodes    int                // number of nodes in the cluster
    83  	DataDir     string             // node i will use storage DataDir/<i>
    84  	LogDir      string             // when empty, node i defaults to DataDir/<i>/logs
    85  	PerNodeCfg  map[int]NodeConfig // optional map of nodeIndex -> configuration
    86  	DB          string             // database to configure DB connection for
    87  	NumWorkers  int                // SetMaxOpenConns to use for DB connection
    88  	NoWait      bool               // if set, return from Start before cluster ready
    89  }
    90  
    91  // NodeConfig is a configuration for a node in a Cluster. Options with the zero
    92  // value are typically populated from the corresponding Cluster's ClusterConfig.
    93  type NodeConfig struct {
    94  	Binary            string   // when specified, overrides the node's binary
    95  	DataDir           string   // when specified, overrides the node's data dir
    96  	LogDir            string   // when specified, overrides the node's log dir
    97  	Addr              string   // listening host, defaults to 127.0.0.1
    98  	ExtraArgs         []string // extra arguments for ./cockroach start
    99  	ExtraEnv          []string // environment variables in format key=value
   100  	RPCPort, HTTPPort int      // zero for auto-assign
   101  	DB                string   // see ClusterConfig
   102  	NumWorkers        int      // see ClusterConfig
   103  }
   104  
   105  // MakePerNodeFixedPortsCfg makes a PerNodeCfg map of the given number of nodes
   106  // with odd ports starting at 26257 for the RPC endpoint, and even points for
   107  // the ui.
   108  func MakePerNodeFixedPortsCfg(numNodes int) map[int]NodeConfig {
   109  	perNodeCfg := make(map[int]NodeConfig)
   110  
   111  	for i := 0; i < numNodes; i++ {
   112  		perNodeCfg[i] = NodeConfig{
   113  			RPCPort:  26257 + 2*i,
   114  			HTTPPort: 26258 + 2*i,
   115  		}
   116  	}
   117  
   118  	return perNodeCfg
   119  }
   120  
   121  // Cluster holds the state for a local cluster, providing methods for common
   122  // operations, access to the underlying nodes and per-node KV and SQL clients.
   123  type Cluster struct {
   124  	Cfg     ClusterConfig
   125  	seq     *seqGen
   126  	Nodes   []*Node
   127  	stopper *stop.Stopper
   128  	started time.Time
   129  }
   130  
   131  type seqGen int32
   132  
   133  func (s *seqGen) Next() int32 {
   134  	return atomic.AddInt32((*int32)(s), 1)
   135  }
   136  
   137  // New creates a Cluster with the given configuration.
   138  func New(cfg ClusterConfig) *Cluster {
   139  	if cfg.Binary == "" {
   140  		cfg.Binary = SourceBinary()
   141  	}
   142  	return &Cluster{
   143  		Cfg:     cfg,
   144  		seq:     new(seqGen),
   145  		stopper: stop.NewStopper(),
   146  	}
   147  }
   148  
   149  // Start starts a cluster. The numWorkers parameter controls the SQL connection
   150  // settings to avoid unnecessary connection creation. The allNodeArgs parameter
   151  // can be used to pass extra arguments to every node. The perNodeArgs parameter
   152  // can be used to pass extra arguments to an individual node. If not nil, its
   153  // size must equal the number of nodes.
   154  func (c *Cluster) Start(ctx context.Context) {
   155  	c.started = timeutil.Now()
   156  
   157  	chs := make([]<-chan error, c.Cfg.NumNodes)
   158  	for i := 0; i < c.Cfg.NumNodes; i++ {
   159  		cfg := c.Cfg.PerNodeCfg[i] // zero value is ok
   160  		if cfg.Binary == "" {
   161  			cfg.Binary = c.Cfg.Binary
   162  		}
   163  		if cfg.DataDir == "" {
   164  			cfg.DataDir = filepath.Join(c.Cfg.DataDir, fmt.Sprintf("%d", i+1))
   165  		}
   166  		if cfg.LogDir == "" && c.Cfg.LogDir != "" {
   167  			cfg.LogDir = filepath.Join(c.Cfg.LogDir, fmt.Sprintf("%d", i+1))
   168  		}
   169  		if cfg.Addr == "" {
   170  			cfg.Addr = "127.0.0.1"
   171  		}
   172  		if cfg.DB == "" {
   173  			cfg.DB = c.Cfg.DB
   174  		}
   175  		if cfg.NumWorkers == 0 {
   176  			cfg.NumWorkers = c.Cfg.NumWorkers
   177  		}
   178  		cfg.ExtraArgs = append(append([]string(nil), c.Cfg.AllNodeArgs...), cfg.ExtraArgs...)
   179  		var node *Node
   180  		node, chs[i] = c.makeNode(ctx, i, cfg)
   181  		c.Nodes = append(c.Nodes, node)
   182  		if i == 0 && cfg.RPCPort == 0 && c.Cfg.NumNodes > 1 {
   183  			// The first node must know its RPCPort or we can't possibly tell
   184  			// the other nodes the correct one to go to.
   185  			//
   186  			// Note: we can't set up a cluster first and clone it for each test,
   187  			// because all ports change so the cluster won't come together.
   188  			// Luckily, it takes only ~2 seconds from zero to a replicated 4
   189  			// node cluster.
   190  			if err := <-chs[0]; err != nil {
   191  				log.Fatalf(ctx, "while starting first node: %s", err)
   192  			}
   193  			ch := make(chan error)
   194  			close(ch)
   195  			chs[0] = ch
   196  		}
   197  	}
   198  
   199  	if !c.Cfg.NoWait {
   200  		for i := range chs {
   201  			if err := <-chs[i]; err != nil {
   202  				log.Fatalf(ctx, "node %d: %s", i+1, err)
   203  			}
   204  		}
   205  	}
   206  
   207  	log.Infof(context.Background(), "started %.3fs", timeutil.Since(c.started).Seconds())
   208  
   209  	if c.Cfg.NumNodes > 1 || !c.Cfg.NoWait {
   210  		c.waitForFullReplication()
   211  	} else {
   212  		// NB: This is useful for TestRapidRestarts.
   213  		log.Infof(ctx, "not waiting for initial replication")
   214  	}
   215  }
   216  
   217  // Close stops the cluster, killing all of the nodes.
   218  func (c *Cluster) Close() {
   219  	for _, n := range c.Nodes {
   220  		n.Kill()
   221  	}
   222  	c.stopper.Stop(context.Background())
   223  	if c.Cfg.Ephemeral {
   224  		_ = os.RemoveAll(c.Cfg.DataDir)
   225  	}
   226  }
   227  
   228  func (c *Cluster) joins() []string {
   229  	type addrAndSeq struct {
   230  		addr string
   231  		seq  int32
   232  	}
   233  
   234  	var joins []addrAndSeq
   235  	for _, node := range c.Nodes {
   236  		advertAddr := node.AdvertiseAddr()
   237  		if advertAddr != "" {
   238  			joins = append(joins, addrAndSeq{
   239  				addr: advertAddr,
   240  				seq:  atomic.LoadInt32(&node.startSeq),
   241  			})
   242  		}
   243  	}
   244  	sort.Slice(joins, func(i, j int) bool {
   245  		return joins[i].seq < joins[j].seq
   246  	})
   247  
   248  	if len(joins) == 0 {
   249  		return nil
   250  	}
   251  
   252  	// Return the node with the smallest startSeq, i.e. the node that was
   253  	// started first. This is the node that might have no --join flag set, and
   254  	// we must point the other nodes at it, and *only* at it (or the other nodes
   255  	// may connect sufficiently and never bother to talk to this node).
   256  	return []string{joins[0].addr}
   257  }
   258  
   259  // IPAddr returns the IP address of the specified node.
   260  func (c *Cluster) IPAddr(nodeIdx int) string {
   261  	return c.Nodes[nodeIdx].IPAddr()
   262  }
   263  
   264  // RPCPort returns the RPC port of the specified node. Returns zero if unknown.
   265  func (c *Cluster) RPCPort(nodeIdx int) string {
   266  	return c.Nodes[nodeIdx].RPCPort()
   267  }
   268  
   269  func (c *Cluster) makeNode(ctx context.Context, nodeIdx int, cfg NodeConfig) (*Node, <-chan error) {
   270  	baseCtx := &base.Config{
   271  		User:     security.NodeUser,
   272  		Insecure: true,
   273  	}
   274  	rpcCtx := rpc.NewContext(log.AmbientContext{Tracer: tracing.NewTracer()}, baseCtx,
   275  		hlc.NewClock(hlc.UnixNano, 0), c.stopper, cluster.MakeTestingClusterSettings())
   276  
   277  	n := &Node{
   278  		Cfg:    cfg,
   279  		rpcCtx: rpcCtx,
   280  		seq:    c.seq,
   281  	}
   282  
   283  	args := []string{
   284  		cfg.Binary,
   285  		"start",
   286  		"--insecure",
   287  		// Although --host/--port are deprecated, we cannot yet replace
   288  		// this here by --listen-addr/--listen-port, because
   289  		// TestVersionUpgrade will also try old binaries.
   290  		fmt.Sprintf("--host=%s", n.IPAddr()),
   291  		fmt.Sprintf("--port=%d", cfg.RPCPort),
   292  		fmt.Sprintf("--http-port=%d", cfg.HTTPPort),
   293  		fmt.Sprintf("--store=%s", cfg.DataDir),
   294  		fmt.Sprintf("--listening-url-file=%s", n.listeningURLFile()),
   295  		fmt.Sprintf("--cache=256MiB"),
   296  	}
   297  
   298  	if n.Cfg.LogDir != "" {
   299  		args = append(args, fmt.Sprintf("--log-dir=%s", n.Cfg.LogDir))
   300  	}
   301  
   302  	n.Cfg.ExtraArgs = append(args, cfg.ExtraArgs...)
   303  
   304  	if err := os.MkdirAll(n.logDir(), 0755); err != nil {
   305  		log.Fatalf(context.Background(), "%v", err)
   306  	}
   307  
   308  	joins := c.joins()
   309  	if nodeIdx > 0 && len(joins) == 0 {
   310  		ch := make(chan error, 1)
   311  		ch <- errors.Errorf("node %d started without join flags", nodeIdx+1)
   312  		return nil, ch
   313  	}
   314  	ch := n.StartAsync(ctx, joins...)
   315  	return n, ch
   316  }
   317  
   318  // waitForFullReplication waits for the cluster to be fully replicated.
   319  func (c *Cluster) waitForFullReplication() {
   320  	for i := 1; true; i++ {
   321  		done, detail := c.isReplicated()
   322  		if (done && i >= 50) || (i%50) == 0 {
   323  			fmt.Print(detail)
   324  			log.Infof(context.Background(), "waiting for replication")
   325  		}
   326  		if done {
   327  			break
   328  		}
   329  		time.Sleep(100 * time.Millisecond)
   330  	}
   331  
   332  	log.Infof(context.Background(), "replicated %.3fs", timeutil.Since(c.started).Seconds())
   333  }
   334  
   335  func (c *Cluster) isReplicated() (bool, string) {
   336  	db := c.Nodes[0].DB()
   337  	rows, err := db.Query(`SELECT range_id, start_key, end_key, array_length(replicas, 1) FROM crdb_internal.ranges`)
   338  	if err != nil {
   339  		// Versions <= 1.1 do not contain the crdb_internal table, which is what's used
   340  		// to determine whether a cluster has up-replicated. This is relevant for the
   341  		// version upgrade acceptance test. Just skip the replication check for this case.
   342  		if testutils.IsError(err, "(table|relation) \"crdb_internal.ranges\" does not exist") {
   343  			return true, ""
   344  		}
   345  		log.Fatalf(context.Background(), "%v", err)
   346  	}
   347  	defer rows.Close()
   348  
   349  	var buf bytes.Buffer
   350  	tw := tabwriter.NewWriter(&buf, 2, 1, 2, ' ', 0)
   351  	done := true
   352  	for rows.Next() {
   353  		var rangeID int64
   354  		var startKey, endKey roachpb.Key
   355  		var numReplicas int
   356  		if err := rows.Scan(&rangeID, &startKey, &endKey, &numReplicas); err != nil {
   357  			log.Fatalf(context.Background(), "unable to scan range replicas: %s", err)
   358  		}
   359  		fmt.Fprintf(tw, "\t%s\t%s\t[%d]\t%d\n", startKey, endKey, rangeID, numReplicas)
   360  		// This check is coarse since it doesn't know the real configuration.
   361  		// Assume all is well when there are 3+ replicas, or if there are as
   362  		// many replicas as there are nodes.
   363  		if numReplicas < 3 && numReplicas != len(c.Nodes) {
   364  			done = false
   365  		}
   366  	}
   367  	_ = tw.Flush()
   368  	return done, buf.String()
   369  }
   370  
   371  // UpdateZoneConfig updates the default zone config for the cluster.
   372  func (c *Cluster) UpdateZoneConfig(rangeMinBytes, rangeMaxBytes int64) {
   373  	zone := zonepb.DefaultZoneConfig()
   374  	zone.RangeMinBytes = proto.Int64(rangeMinBytes)
   375  	zone.RangeMaxBytes = proto.Int64(rangeMaxBytes)
   376  
   377  	buf, err := protoutil.Marshal(&zone)
   378  	if err != nil {
   379  		log.Fatalf(context.Background(), "%v", err)
   380  	}
   381  	_, err = c.Nodes[0].DB().Exec(`UPSERT INTO system.zones (id, config) VALUES (0, $1)`, buf)
   382  	if err != nil {
   383  		log.Fatalf(context.Background(), "%v", err)
   384  	}
   385  }
   386  
   387  // Split splits the range containing the split key at the specified split key.
   388  func (c *Cluster) Split(nodeIdx int, splitKey roachpb.Key) error {
   389  	return errors.Errorf("Split is unimplemented and should be re-implemented using SQL")
   390  }
   391  
   392  // TransferLease transfers the lease for the range containing key to a random
   393  // alive node in the range.
   394  func (c *Cluster) TransferLease(nodeIdx int, r *rand.Rand, key roachpb.Key) (bool, error) {
   395  	return false, errors.Errorf("TransferLease is unimplemented and should be re-implemented using SQL")
   396  }
   397  
   398  // RandNode returns the index of a random alive node.
   399  func (c *Cluster) RandNode(f func(int) int) int {
   400  	for {
   401  		i := f(len(c.Nodes))
   402  		if c.Nodes[i].Alive() {
   403  			return i
   404  		}
   405  	}
   406  }
   407  
   408  // Node holds the state for a single node in a local cluster and provides
   409  // methods for starting, pausing, resuming and stopping the node.
   410  type Node struct {
   411  	Cfg    NodeConfig
   412  	rpcCtx *rpc.Context
   413  	seq    *seqGen
   414  
   415  	startSeq int32        // updated atomically on start, nonzero while running
   416  	waitErr  atomic.Value // last `error`` returned from cmd.Wait()
   417  
   418  	syncutil.Mutex
   419  	notRunning     chan struct{}
   420  	cmd            *exec.Cmd
   421  	rpcPort, pgURL string // legacy: remove once 1.0.x is no longer tested
   422  	db             *gosql.DB
   423  	statusClient   serverpb.StatusClient
   424  }
   425  
   426  // RPCPort returns the RPC + Postgres port.
   427  func (n *Node) RPCPort() string {
   428  	if s := func() string {
   429  		// Legacy case. To be removed.
   430  		n.Lock()
   431  		defer n.Unlock()
   432  		if n.rpcPort != "" && n.rpcPort != "0" {
   433  			return n.rpcPort
   434  		}
   435  		return ""
   436  	}(); s != "" {
   437  		return s
   438  	}
   439  
   440  	advAddr := readFileOrEmpty(n.advertiseAddrFile())
   441  	if advAddr == "" {
   442  		return ""
   443  	}
   444  	_, p, _ := net.SplitHostPort(advAddr)
   445  	return p
   446  }
   447  
   448  // RPCAddr returns the RPC + Postgres address, or an empty string if it is not known
   449  // (for instance since the node is down).
   450  func (n *Node) RPCAddr() string {
   451  	port := n.RPCPort()
   452  	if port == "" || port == "0" {
   453  		return ""
   454  	}
   455  	return net.JoinHostPort(n.IPAddr(), port)
   456  }
   457  
   458  // HTTPAddr returns the HTTP address (once known).
   459  func (n *Node) HTTPAddr() string {
   460  	return readFileOrEmpty(n.httpAddrFile())
   461  }
   462  
   463  // PGUrl returns the postgres connection string (may be empty until known).
   464  func (n *Node) PGUrl() string {
   465  	n.Lock()
   466  	defer n.Unlock()
   467  	return n.pgURL
   468  }
   469  
   470  // Alive returns true if the node is alive (i.e. not stopped). Note that a
   471  // paused node is considered alive.
   472  func (n *Node) Alive() bool {
   473  	n.Lock()
   474  	defer n.Unlock()
   475  	return n.cmd != nil
   476  }
   477  
   478  // StatusClient returns a StatusClient set up to talk to this node.
   479  func (n *Node) StatusClient() serverpb.StatusClient {
   480  	n.Lock()
   481  	existingClient := n.statusClient
   482  	n.Unlock()
   483  
   484  	if existingClient != nil {
   485  		return existingClient
   486  	}
   487  
   488  	conn, _, err := n.rpcCtx.GRPCDialRaw(n.RPCAddr())
   489  	if err != nil {
   490  		log.Fatalf(context.Background(), "failed to initialize status client: %s", err)
   491  	}
   492  	return serverpb.NewStatusClient(conn)
   493  }
   494  
   495  func (n *Node) logDir() string {
   496  	if n.Cfg.LogDir == "" {
   497  		return filepath.Join(n.Cfg.DataDir, "logs")
   498  	}
   499  	return n.Cfg.LogDir
   500  }
   501  
   502  func (n *Node) listeningURLFile() string {
   503  	return filepath.Join(n.Cfg.DataDir, listeningURLFile)
   504  }
   505  
   506  // Start starts a node.
   507  func (n *Node) Start(ctx context.Context, joins ...string) {
   508  	if err := <-n.StartAsync(ctx, joins...); err != nil {
   509  		log.Fatalf(ctx, "%v", err)
   510  	}
   511  }
   512  
   513  func (n *Node) setNotRunningLocked(waitErr *exec.ExitError) {
   514  	_ = os.Remove(n.listeningURLFile())
   515  	_ = os.Remove(n.advertiseAddrFile())
   516  	_ = os.Remove(n.httpAddrFile())
   517  	if n.notRunning != nil {
   518  		close(n.notRunning)
   519  	}
   520  	n.notRunning = make(chan struct{})
   521  	n.db = nil
   522  	n.statusClient = nil
   523  	n.cmd = nil
   524  	n.rpcPort = ""
   525  	n.waitErr.Store(waitErr)
   526  	atomic.StoreInt32(&n.startSeq, 0)
   527  }
   528  
   529  func (n *Node) startAsyncInnerLocked(ctx context.Context, joins ...string) error {
   530  	n.setNotRunningLocked(nil)
   531  
   532  	args := append([]string(nil), n.Cfg.ExtraArgs[1:]...)
   533  	for _, join := range joins {
   534  		args = append(args, "--join", join)
   535  	}
   536  	n.cmd = exec.Command(n.Cfg.ExtraArgs[0], args...)
   537  	n.cmd.Env = os.Environ()
   538  	n.cmd.Env = append(n.cmd.Env, "COCKROACH_SCAN_MAX_IDLE_TIME=5ms") // speed up rebalancing
   539  	n.cmd.Env = append(n.cmd.Env, n.Cfg.ExtraEnv...)
   540  
   541  	atomic.StoreInt32(&n.startSeq, n.seq.Next())
   542  
   543  	_ = os.MkdirAll(n.logDir(), 0755)
   544  
   545  	stdoutPath := filepath.Join(n.logDir(), "stdout")
   546  	stdout, err := os.OpenFile(stdoutPath, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
   547  	if err != nil {
   548  		return errors.Wrapf(err, "unable to open file %s", stdoutPath)
   549  	}
   550  	// This causes the "node startup header" to be printed to stdout, which is
   551  	// helpful and not too noisy.
   552  	n.cmd.Stdout = io.MultiWriter(stdout, os.Stdout)
   553  
   554  	stderrPath := filepath.Join(n.logDir(), "stderr")
   555  	stderr, err := os.OpenFile(stderrPath, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
   556  	if err != nil {
   557  		return errors.Wrapf(err, "unable to open file %s", stderrPath)
   558  	}
   559  	n.cmd.Stderr = stderr
   560  
   561  	if n.Cfg.RPCPort > 0 {
   562  		n.rpcPort = fmt.Sprintf("%d", n.Cfg.RPCPort)
   563  	}
   564  
   565  	if err := n.cmd.Start(); err != nil {
   566  		if err := stdout.Close(); err != nil {
   567  			log.Warningf(ctx, "%v", err)
   568  		}
   569  		if err := stderr.Close(); err != nil {
   570  			log.Warningf(ctx, "%v", err)
   571  		}
   572  		return errors.Wrapf(err, "running %s %v", n.cmd.Path, n.cmd.Args)
   573  	}
   574  
   575  	log.Infof(ctx, "process %d starting: %s", n.cmd.Process.Pid, n.cmd.Args)
   576  
   577  	go func(cmd *exec.Cmd) {
   578  		waitErr := cmd.Wait()
   579  		if waitErr != nil {
   580  			log.Warningf(ctx, "%v", waitErr)
   581  		}
   582  		if err := stdout.Close(); err != nil {
   583  			log.Warningf(ctx, "%v", err)
   584  		}
   585  		if err := stderr.Close(); err != nil {
   586  			log.Warningf(ctx, "%v", err)
   587  		}
   588  
   589  		log.Infof(ctx, "process %d: %s", cmd.Process.Pid, cmd.ProcessState)
   590  
   591  		var execErr *exec.ExitError
   592  		_ = errors.As(waitErr, &execErr)
   593  		n.Lock()
   594  		n.setNotRunningLocked(execErr)
   595  		n.Unlock()
   596  	}(n.cmd)
   597  
   598  	return nil
   599  }
   600  
   601  // StartAsync starts a node asynchronously. It returns a buffered channel that
   602  // receives either an error, or, once the node has started up and is fully
   603  // functional, `nil`.
   604  //
   605  // StartAsync is a no-op if the node is already running.
   606  func (n *Node) StartAsync(ctx context.Context, joins ...string) <-chan error {
   607  	ch := make(chan error, 1)
   608  
   609  	if err := func() error {
   610  		n.Lock()
   611  		defer n.Unlock()
   612  		if n.cmd != nil {
   613  			return errors.New("server is already running")
   614  		}
   615  		return n.startAsyncInnerLocked(ctx, joins...)
   616  	}(); err != nil {
   617  		ch <- err
   618  		return ch
   619  	}
   620  
   621  	go func() {
   622  		// If the node does not become live within a minute, something is wrong and
   623  		// it's better not to hang indefinitely.
   624  		ch <- n.waitUntilLive(time.Minute)
   625  	}()
   626  
   627  	return ch
   628  }
   629  
   630  func portFromURL(rawURL string) (string, *url.URL, error) {
   631  	u, err := url.Parse(rawURL)
   632  	if err != nil {
   633  		return "", nil, err
   634  	}
   635  
   636  	_, port, err := net.SplitHostPort(u.Host)
   637  	return port, u, err
   638  }
   639  
   640  func makeDB(url string, numWorkers int, dbName string) *gosql.DB {
   641  	conn, err := gosql.Open("postgres", url)
   642  	if err != nil {
   643  		log.Fatalf(context.Background(), "%v", err)
   644  	}
   645  	if numWorkers == 0 {
   646  		numWorkers = 1
   647  	}
   648  	conn.SetMaxOpenConns(numWorkers)
   649  	conn.SetMaxIdleConns(numWorkers)
   650  	return conn
   651  }
   652  
   653  func (n *Node) advertiseAddrFile() string {
   654  	return filepath.Join(n.Cfg.DataDir, "cockroach.advertise-addr")
   655  }
   656  
   657  func (n *Node) httpAddrFile() string {
   658  	return filepath.Join(n.Cfg.DataDir, "cockroach.http-addr")
   659  }
   660  
   661  func readFileOrEmpty(f string) string {
   662  	c, err := ioutil.ReadFile(f)
   663  	if err != nil {
   664  		if !os.IsNotExist(err) {
   665  			panic(err)
   666  		}
   667  		return ""
   668  	}
   669  	return string(c)
   670  }
   671  
   672  // AdvertiseAddr returns the Node's AdvertiseAddr or empty if none is available.
   673  func (n *Node) AdvertiseAddr() (s string) {
   674  	addr := readFileOrEmpty(n.advertiseAddrFile())
   675  	if addr != "" {
   676  		return addr
   677  	}
   678  	// The below is part of the workaround for nodes at v1.0 which don't
   679  	// write the file above, explained in more detail in StartAsync().
   680  	if port := n.RPCPort(); port != "" {
   681  		return net.JoinHostPort(n.IPAddr(), n.RPCPort())
   682  	}
   683  	return addr
   684  }
   685  
   686  func (n *Node) waitUntilLive(dur time.Duration) error {
   687  	ctx := context.Background()
   688  	closer := make(chan struct{})
   689  	defer time.AfterFunc(dur, func() { close(closer) }).Stop()
   690  	opts := retry.Options{
   691  		InitialBackoff: time.Millisecond,
   692  		MaxBackoff:     500 * time.Millisecond,
   693  		Multiplier:     2,
   694  		Closer:         closer,
   695  	}
   696  	for r := retry.Start(opts); r.Next(); {
   697  		var pid int
   698  		n.Lock()
   699  		if n.cmd != nil {
   700  			pid = n.cmd.Process.Pid
   701  		}
   702  		n.Unlock()
   703  		if pid == 0 {
   704  			log.Info(ctx, "process already quit")
   705  			return nil
   706  		}
   707  
   708  		urlBytes, err := ioutil.ReadFile(n.listeningURLFile())
   709  		if err != nil {
   710  			log.Infof(ctx, "%v", err)
   711  			continue
   712  		}
   713  
   714  		var pgURL *url.URL
   715  		_, pgURL, err = portFromURL(string(urlBytes))
   716  		if err != nil {
   717  			log.Infof(ctx, "%v", err)
   718  			continue
   719  		}
   720  
   721  		if n.Cfg.RPCPort == 0 {
   722  			n.Lock()
   723  			n.rpcPort = pgURL.Port()
   724  			n.Unlock()
   725  		}
   726  
   727  		pgURL.Path = n.Cfg.DB
   728  		n.Lock()
   729  		n.pgURL = pgURL.String()
   730  		n.Unlock()
   731  
   732  		var uiURL *url.URL
   733  
   734  		defer func() {
   735  			log.Infof(ctx, "process %d started (db: %s ui: %s)", pid, pgURL, uiURL)
   736  		}()
   737  
   738  		// We're basically running, but (at least) the decommissioning test sometimes starts
   739  		// up servers that can already be draining when they get here. For that reason, leave
   740  		// the admin port undefined if we don't manage to get it.
   741  		//
   742  		// This can be improved by making the below code run opportunistically whenever the
   743  		// http port is required but isn't initialized yet.
   744  		n.Lock()
   745  		n.db = makeDB(n.pgURL, n.Cfg.NumWorkers, n.Cfg.DB)
   746  		n.Unlock()
   747  
   748  		{
   749  			var uiStr string
   750  			if err := n.db.QueryRow(
   751  				`SELECT value FROM crdb_internal.node_runtime_info WHERE component='UI' AND field = 'URL'`,
   752  			).Scan(&uiStr); err != nil {
   753  				log.Infof(ctx, "%v", err)
   754  				return nil
   755  			}
   756  
   757  			_, uiURL, err = portFromURL(uiStr)
   758  			if err != nil {
   759  				log.Infof(ctx, "%v", err)
   760  				// TODO(tschottdorf): see above.
   761  			}
   762  		}
   763  		return nil
   764  	}
   765  	return errors.Errorf("node %+v was unable to join cluster within %s", n.Cfg, dur)
   766  }
   767  
   768  // Kill stops a node abruptly by sending it SIGKILL.
   769  func (n *Node) Kill() {
   770  	n.Signal(os.Kill)
   771  	// Wait for the process to have been cleaned up (or a call to Start() could
   772  	// turn into an unintended no-op).
   773  	for ok := false; !ok; {
   774  		n.Lock()
   775  		ok = n.cmd == nil
   776  		n.Unlock()
   777  	}
   778  }
   779  
   780  // IPAddr returns the node's listening address (for ui, inter-node, cli, and
   781  // Postgres alike).
   782  func (n *Node) IPAddr() string {
   783  	return n.Cfg.Addr
   784  }
   785  
   786  // DB returns a Postgres connection set up to talk to the node.
   787  func (n *Node) DB() *gosql.DB {
   788  	n.Lock()
   789  	defer n.Unlock()
   790  	return n.db
   791  }
   792  
   793  // Signal sends the given signal to the process. It is a no-op if the process is
   794  // not running.
   795  func (n *Node) Signal(s os.Signal) {
   796  	n.Lock()
   797  	defer n.Unlock()
   798  	if n.cmd == nil || n.cmd.Process == nil {
   799  		return
   800  	}
   801  	if err := n.cmd.Process.Signal(s); err != nil {
   802  		log.Warningf(context.Background(), "%v", err)
   803  	}
   804  }
   805  
   806  // Wait waits for the process to terminate and returns its process' Wait(). This
   807  // is nil if the process terminated with a zero exit code.
   808  func (n *Node) Wait() *exec.ExitError {
   809  	n.Lock()
   810  	ch := n.notRunning
   811  	n.Unlock()
   812  	if ch == nil {
   813  		log.Warning(context.Background(), "(*Node).Wait called when node was not running")
   814  		return nil
   815  	}
   816  	<-ch
   817  	ee, _ := n.waitErr.Load().(*exec.ExitError)
   818  	return ee
   819  }
   820  
// Silence unused warning: referencing (*Node).Wait here keeps the method
// counted as used even if nothing else in the package calls it.
var _ = (*Node)(nil).Wait