github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/acceptance/localcluster/cluster.go

// Copyright 2016 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package localcluster

import (
	"bytes"
	"context"
	gosql "database/sql"
	"fmt"
	"go/build"
	"io"
	"io/ioutil"
	"math/rand"
	"net"
	"net/url"
	"os"
	"os/exec"
	"path/filepath"
	"sort"
	"strings"
	"sync/atomic"
	"text/tabwriter"
	"time"

	"github.com/cockroachdb/cockroach/pkg/base"
	"github.com/cockroachdb/cockroach/pkg/config/zonepb"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/rpc"
	"github.com/cockroachdb/cockroach/pkg/security"
	"github.com/cockroachdb/cockroach/pkg/server/serverpb"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/testutils"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
	"github.com/cockroachdb/cockroach/pkg/util/retry"
	"github.com/cockroachdb/cockroach/pkg/util/stop"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/cockroach/pkg/util/tracing"
	"github.com/cockroachdb/errors"
	"github.com/gogo/protobuf/proto"
	// Import postgres driver.
	_ "github.com/lib/pq"
)

func repoRoot() string {
	root, err := build.Import("github.com/cockroachdb/cockroach", "", build.FindOnly)
	if err != nil {
		panic(fmt.Sprintf("must run from within the cockroach repository: %s", err))
	}
	return root.Dir
}

// SourceBinary returns the path of the cockroach binary that was built with the
// local source.
func SourceBinary() string {
	return filepath.Join(repoRoot(), "cockroach")
}

const listeningURLFile = "cockroachdb-url"

// IsUnavailableError returns true iff the error corresponds to a GRPC
// connection unavailable error.
func IsUnavailableError(err error) bool {
	return strings.Contains(err.Error(), "grpc: the connection is unavailable")
}

// A ClusterConfig holds the configuration for a Cluster.
type ClusterConfig struct {
	Ephemeral   bool               // when true, wipe DataDir on Close()
	Binary      string             // path to cockroach, defaults to <cockroach_repo>/cockroach
	AllNodeArgs []string           // args to pass to ./cockroach on all nodes
	NumNodes    int                // number of nodes in the cluster
	DataDir     string             // node i will use storage dir DataDir/<i>
	LogDir      string             // when empty, node i defaults to DataDir/<i>/logs
	PerNodeCfg  map[int]NodeConfig // optional map of nodeIndex -> configuration
	DB          string             // database to configure DB connection for
	NumWorkers  int                // SetMaxOpenConns to use for DB connection
	NoWait      bool               // if set, return from Start before cluster ready
}
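// exampleThreeNodeConfig is a hypothetical sketch, not part of the original
// package: it shows how a ClusterConfig might be filled in for an ephemeral
// three-node cluster with fixed ports. The scratch directory and database
// name are assumptions for illustration only.
func exampleThreeNodeConfig() ClusterConfig {
	return ClusterConfig{
		Ephemeral:  true,
		NumNodes:   3,
		DataDir:    filepath.Join(os.TempDir(), "localcluster-example"), // assumed scratch location
		PerNodeCfg: MakePerNodeFixedPortsCfg(3),
		DB:         "test",
		NumWorkers: 4,
	}
}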
// NodeConfig is a configuration for a node in a Cluster. Options with the zero
// value are typically populated from the corresponding Cluster's ClusterConfig.
type NodeConfig struct {
	Binary            string   // when specified, overrides the node's binary
	DataDir           string   // when specified, overrides the node's data dir
	LogDir            string   // when specified, overrides the node's log dir
	Addr              string   // listening host, defaults to 127.0.0.1
	ExtraArgs         []string // extra arguments for ./cockroach start
	ExtraEnv          []string // environment variables in format key=value
	RPCPort, HTTPPort int      // zero for auto-assign
	DB                string   // see ClusterConfig
	NumWorkers        int      // see ClusterConfig
}

// MakePerNodeFixedPortsCfg makes a PerNodeCfg map of the given number of nodes
// with odd ports starting at 26257 for the RPC endpoint, and even ports for
// the UI.
func MakePerNodeFixedPortsCfg(numNodes int) map[int]NodeConfig {
	perNodeCfg := make(map[int]NodeConfig)

	for i := 0; i < numNodes; i++ {
		perNodeCfg[i] = NodeConfig{
			RPCPort:  26257 + 2*i,
			HTTPPort: 26258 + 2*i,
		}
	}

	return perNodeCfg
}

// Cluster holds the state for a local cluster, providing methods for common
// operations, access to the underlying nodes and per-node KV and SQL clients.
type Cluster struct {
	Cfg     ClusterConfig
	seq     *seqGen
	Nodes   []*Node
	stopper *stop.Stopper
	started time.Time
}

type seqGen int32

func (s *seqGen) Next() int32 {
	return atomic.AddInt32((*int32)(s), 1)
}

// New creates a Cluster with the given configuration.
func New(cfg ClusterConfig) *Cluster {
	if cfg.Binary == "" {
		cfg.Binary = SourceBinary()
	}
	return &Cluster{
		Cfg:     cfg,
		seq:     new(seqGen),
		stopper: stop.NewStopper(),
	}
}

// Start starts a cluster. The SQL connection settings (NumWorkers), the extra
// arguments passed to every node (AllNodeArgs), and any per-node overrides
// (PerNodeCfg) are taken from the Cluster's ClusterConfig rather than passed
// in here.
func (c *Cluster) Start(ctx context.Context) {
	c.started = timeutil.Now()

	chs := make([]<-chan error, c.Cfg.NumNodes)
	for i := 0; i < c.Cfg.NumNodes; i++ {
		cfg := c.Cfg.PerNodeCfg[i] // zero value is ok
		if cfg.Binary == "" {
			cfg.Binary = c.Cfg.Binary
		}
		if cfg.DataDir == "" {
			cfg.DataDir = filepath.Join(c.Cfg.DataDir, fmt.Sprintf("%d", i+1))
		}
		if cfg.LogDir == "" && c.Cfg.LogDir != "" {
			cfg.LogDir = filepath.Join(c.Cfg.LogDir, fmt.Sprintf("%d", i+1))
		}
		if cfg.Addr == "" {
			cfg.Addr = "127.0.0.1"
		}
		if cfg.DB == "" {
			cfg.DB = c.Cfg.DB
		}
		if cfg.NumWorkers == 0 {
			cfg.NumWorkers = c.Cfg.NumWorkers
		}
		cfg.ExtraArgs = append(append([]string(nil), c.Cfg.AllNodeArgs...), cfg.ExtraArgs...)
		var node *Node
		node, chs[i] = c.makeNode(ctx, i, cfg)
		c.Nodes = append(c.Nodes, node)
		if i == 0 && cfg.RPCPort == 0 && c.Cfg.NumNodes > 1 {
			// The first node must know its RPCPort or we can't possibly tell
			// the other nodes the correct one to go to.
			//
			// Note: we can't set up a cluster first and clone it for each test,
			// because all ports change so the cluster won't come together.
			// Luckily, it takes only ~2 seconds from zero to a replicated 4
			// node cluster.
			if err := <-chs[0]; err != nil {
				log.Fatalf(ctx, "while starting first node: %s", err)
			}
			ch := make(chan error)
			close(ch)
			chs[0] = ch
		}
	}

	if !c.Cfg.NoWait {
		for i := range chs {
			if err := <-chs[i]; err != nil {
				log.Fatalf(ctx, "node %d: %s", i+1, err)
			}
		}
	}

	log.Infof(context.Background(), "started %.3fs", timeutil.Since(c.started).Seconds())

	if c.Cfg.NumNodes > 1 || !c.Cfg.NoWait {
		c.waitForFullReplication()
	} else {
		// NB: This is useful for TestRapidRestarts.
		log.Infof(ctx, "not waiting for initial replication")
	}
}
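// exampleStartStop is a hypothetical usage sketch (building on the config
// helper above); it is not called anywhere and only illustrates the intended
// call sequence: New, Start, Close, and per-node SQL access once Start has
// returned. The CREATE DATABASE statement is an assumption for illustration.
func exampleStartStop(ctx context.Context) {
	c := New(exampleThreeNodeConfig())
	c.Start(ctx) // with NoWait unset, this returns only once the cluster is up-replicated
	defer c.Close()

	// Per-node SQL clients are available after Start.
	if _, err := c.Nodes[0].DB().Exec(`CREATE DATABASE IF NOT EXISTS test`); err != nil {
		log.Fatalf(ctx, "%v", err)
	}
}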
// Close stops the cluster, killing all of the nodes.
func (c *Cluster) Close() {
	for _, n := range c.Nodes {
		n.Kill()
	}
	c.stopper.Stop(context.Background())
	if c.Cfg.Ephemeral {
		_ = os.RemoveAll(c.Cfg.DataDir)
	}
}

func (c *Cluster) joins() []string {
	type addrAndSeq struct {
		addr string
		seq  int32
	}

	var joins []addrAndSeq
	for _, node := range c.Nodes {
		advertAddr := node.AdvertiseAddr()
		if advertAddr != "" {
			joins = append(joins, addrAndSeq{
				addr: advertAddr,
				seq:  atomic.LoadInt32(&node.startSeq),
			})
		}
	}
	sort.Slice(joins, func(i, j int) bool {
		return joins[i].seq < joins[j].seq
	})

	if len(joins) == 0 {
		return nil
	}

	// Return the node with the smallest startSeq, i.e. the node that was
	// started first. This is the node that might have no --join flag set, and
	// we must point the other nodes at it, and *only* at it (or the other nodes
	// may consider themselves sufficiently connected and never bother to talk
	// to this node).
	return []string{joins[0].addr}
}

// IPAddr returns the IP address of the specified node.
func (c *Cluster) IPAddr(nodeIdx int) string {
	return c.Nodes[nodeIdx].IPAddr()
}

// RPCPort returns the RPC port of the specified node. Returns an empty string
// if the port is not yet known.
func (c *Cluster) RPCPort(nodeIdx int) string {
	return c.Nodes[nodeIdx].RPCPort()
}
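// examplePGURL is a hypothetical sketch showing how the address helpers above
// could be combined into a Postgres URL for node nodeIdx of an insecure
// cluster. In practice Node.PGUrl already returns the server-reported URL, so
// this is illustrative only; the "root" user and sslmode are assumptions.
func examplePGURL(c *Cluster, nodeIdx int) string {
	return fmt.Sprintf("postgresql://root@%s:%s/%s?sslmode=disable",
		c.IPAddr(nodeIdx), c.RPCPort(nodeIdx), c.Cfg.DB)
}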
func (c *Cluster) makeNode(ctx context.Context, nodeIdx int, cfg NodeConfig) (*Node, <-chan error) {
	baseCtx := &base.Config{
		User:     security.NodeUser,
		Insecure: true,
	}
	rpcCtx := rpc.NewContext(log.AmbientContext{Tracer: tracing.NewTracer()}, baseCtx,
		hlc.NewClock(hlc.UnixNano, 0), c.stopper, cluster.MakeTestingClusterSettings())

	n := &Node{
		Cfg:    cfg,
		rpcCtx: rpcCtx,
		seq:    c.seq,
	}

	args := []string{
		cfg.Binary,
		"start",
		"--insecure",
		// Although --host/--port are deprecated, we cannot yet replace
		// this here by --listen-addr/--listen-port, because
		// TestVersionUpgrade will also try old binaries.
		fmt.Sprintf("--host=%s", n.IPAddr()),
		fmt.Sprintf("--port=%d", cfg.RPCPort),
		fmt.Sprintf("--http-port=%d", cfg.HTTPPort),
		fmt.Sprintf("--store=%s", cfg.DataDir),
		fmt.Sprintf("--listening-url-file=%s", n.listeningURLFile()),
		"--cache=256MiB",
	}

	if n.Cfg.LogDir != "" {
		args = append(args, fmt.Sprintf("--log-dir=%s", n.Cfg.LogDir))
	}

	n.Cfg.ExtraArgs = append(args, cfg.ExtraArgs...)

	if err := os.MkdirAll(n.logDir(), 0755); err != nil {
		log.Fatalf(context.Background(), "%v", err)
	}

	joins := c.joins()
	if nodeIdx > 0 && len(joins) == 0 {
		ch := make(chan error, 1)
		ch <- errors.Errorf("node %d started without join flags", nodeIdx+1)
		return nil, ch
	}
	ch := n.StartAsync(ctx, joins...)
	return n, ch
}

// waitForFullReplication waits for the cluster to be fully replicated.
func (c *Cluster) waitForFullReplication() {
	for i := 1; true; i++ {
		done, detail := c.isReplicated()
		if (done && i >= 50) || (i%50) == 0 {
			fmt.Print(detail)
			log.Infof(context.Background(), "waiting for replication")
		}
		if done {
			break
		}
		time.Sleep(100 * time.Millisecond)
	}

	log.Infof(context.Background(), "replicated %.3fs", timeutil.Since(c.started).Seconds())
}

func (c *Cluster) isReplicated() (bool, string) {
	db := c.Nodes[0].DB()
	rows, err := db.Query(`SELECT range_id, start_key, end_key, array_length(replicas, 1) FROM crdb_internal.ranges`)
	if err != nil {
		// Versions <= 1.1 do not contain the crdb_internal.ranges table, which is
		// what's used to determine whether a cluster has up-replicated. This is
		// relevant for the version upgrade acceptance test. Just skip the
		// replication check for this case.
		if testutils.IsError(err, "(table|relation) \"crdb_internal.ranges\" does not exist") {
			return true, ""
		}
		log.Fatalf(context.Background(), "%v", err)
	}
	defer rows.Close()

	var buf bytes.Buffer
	tw := tabwriter.NewWriter(&buf, 2, 1, 2, ' ', 0)
	done := true
	for rows.Next() {
		var rangeID int64
		var startKey, endKey roachpb.Key
		var numReplicas int
		if err := rows.Scan(&rangeID, &startKey, &endKey, &numReplicas); err != nil {
			log.Fatalf(context.Background(), "unable to scan range replicas: %s", err)
		}
		fmt.Fprintf(tw, "\t%s\t%s\t[%d]\t%d\n", startKey, endKey, rangeID, numReplicas)
		// This check is coarse since it doesn't know the real configuration.
		// Assume all is well when there are 3+ replicas, or if there are as
		// many replicas as there are nodes.
		if numReplicas < 3 && numReplicas != len(c.Nodes) {
			done = false
		}
	}
	_ = tw.Flush()
	return done, buf.String()
}

// UpdateZoneConfig updates the default zone config for the cluster.
func (c *Cluster) UpdateZoneConfig(rangeMinBytes, rangeMaxBytes int64) {
	zone := zonepb.DefaultZoneConfig()
	zone.RangeMinBytes = proto.Int64(rangeMinBytes)
	zone.RangeMaxBytes = proto.Int64(rangeMaxBytes)

	buf, err := protoutil.Marshal(&zone)
	if err != nil {
		log.Fatalf(context.Background(), "%v", err)
	}
	_, err = c.Nodes[0].DB().Exec(`UPSERT INTO system.zones (id, config) VALUES (0, $1)`, buf)
	if err != nil {
		log.Fatalf(context.Background(), "%v", err)
	}
}

// Split splits the range containing the split key at the specified split key.
func (c *Cluster) Split(nodeIdx int, splitKey roachpb.Key) error {
	return errors.Errorf("Split is unimplemented and should be re-implemented using SQL")
}

// TransferLease transfers the lease for the range containing key to a random
// alive node in the range.
func (c *Cluster) TransferLease(nodeIdx int, r *rand.Rand, key roachpb.Key) (bool, error) {
	return false, errors.Errorf("TransferLease is unimplemented and should be re-implemented using SQL")
}
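// exampleSmallRanges is a hypothetical sketch showing how UpdateZoneConfig
// above might be used to shrink the default zone config so that ranges split
// quickly. The byte sizes are assumptions for illustration, not values this
// package prescribes.
func exampleSmallRanges(c *Cluster) {
	const minBytes, maxBytes = 1 << 10, 1 << 20 // 1 KiB / 1 MiB, illustrative only
	c.UpdateZoneConfig(minBytes, maxBytes)
}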
// RandNode returns the index of a random alive node.
func (c *Cluster) RandNode(f func(int) int) int {
	for {
		i := f(len(c.Nodes))
		if c.Nodes[i].Alive() {
			return i
		}
	}
}

// Node holds the state for a single node in a local cluster and provides
// methods for starting, pausing, resuming and stopping the node.
type Node struct {
	Cfg    NodeConfig
	rpcCtx *rpc.Context
	seq    *seqGen

	startSeq int32        // updated atomically on start, nonzero while running
	waitErr  atomic.Value // last `error` returned from cmd.Wait()

	syncutil.Mutex
	notRunning     chan struct{}
	cmd            *exec.Cmd
	rpcPort, pgURL string // legacy: remove once 1.0.x is no longer tested
	db             *gosql.DB
	statusClient   serverpb.StatusClient
}

// RPCPort returns the RPC + Postgres port.
func (n *Node) RPCPort() string {
	if s := func() string {
		// Legacy case. To be removed.
		n.Lock()
		defer n.Unlock()
		if n.rpcPort != "" && n.rpcPort != "0" {
			return n.rpcPort
		}
		return ""
	}(); s != "" {
		return s
	}

	advAddr := readFileOrEmpty(n.advertiseAddrFile())
	if advAddr == "" {
		return ""
	}
	_, p, _ := net.SplitHostPort(advAddr)
	return p
}

// RPCAddr returns the RPC + Postgres address, or an empty string if it is not
// known (for instance since the node is down).
func (n *Node) RPCAddr() string {
	port := n.RPCPort()
	if port == "" || port == "0" {
		return ""
	}
	return net.JoinHostPort(n.IPAddr(), port)
}

// HTTPAddr returns the HTTP address (once known).
func (n *Node) HTTPAddr() string {
	return readFileOrEmpty(n.httpAddrFile())
}

// PGUrl returns the postgres connection string (may be empty until known).
func (n *Node) PGUrl() string {
	n.Lock()
	defer n.Unlock()
	return n.pgURL
}

// Alive returns true if the node is alive (i.e. not stopped). Note that a
// paused node is considered alive.
func (n *Node) Alive() bool {
	n.Lock()
	defer n.Unlock()
	return n.cmd != nil
}

// StatusClient returns a StatusClient set up to talk to this node.
func (n *Node) StatusClient() serverpb.StatusClient {
	n.Lock()
	existingClient := n.statusClient
	n.Unlock()

	if existingClient != nil {
		return existingClient
	}

	conn, _, err := n.rpcCtx.GRPCDialRaw(n.RPCAddr())
	if err != nil {
		log.Fatalf(context.Background(), "failed to initialize status client: %s", err)
	}
	return serverpb.NewStatusClient(conn)
}

func (n *Node) logDir() string {
	if n.Cfg.LogDir == "" {
		return filepath.Join(n.Cfg.DataDir, "logs")
	}
	return n.Cfg.LogDir
}

func (n *Node) listeningURLFile() string {
	return filepath.Join(n.Cfg.DataDir, listeningURLFile)
}
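// exampleRandNode is a hypothetical sketch showing how RandNode's selector
// parameter is typically satisfied with a *rand.Rand from math/rand: rng.Intn
// has the expected func(int) int shape, and the result indexes into c.Nodes.
func exampleRandNode(c *Cluster, rng *rand.Rand) *Node {
	return c.Nodes[c.RandNode(rng.Intn)]
}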
// Start starts a node.
func (n *Node) Start(ctx context.Context, joins ...string) {
	if err := <-n.StartAsync(ctx, joins...); err != nil {
		log.Fatalf(ctx, "%v", err)
	}
}

func (n *Node) setNotRunningLocked(waitErr *exec.ExitError) {
	_ = os.Remove(n.listeningURLFile())
	_ = os.Remove(n.advertiseAddrFile())
	_ = os.Remove(n.httpAddrFile())
	if n.notRunning != nil {
		close(n.notRunning)
	}
	n.notRunning = make(chan struct{})
	n.db = nil
	n.statusClient = nil
	n.cmd = nil
	n.rpcPort = ""
	n.waitErr.Store(waitErr)
	atomic.StoreInt32(&n.startSeq, 0)
}

func (n *Node) startAsyncInnerLocked(ctx context.Context, joins ...string) error {
	n.setNotRunningLocked(nil)

	args := append([]string(nil), n.Cfg.ExtraArgs[1:]...)
	for _, join := range joins {
		args = append(args, "--join", join)
	}
	n.cmd = exec.Command(n.Cfg.ExtraArgs[0], args...)
	n.cmd.Env = os.Environ()
	n.cmd.Env = append(n.cmd.Env, "COCKROACH_SCAN_MAX_IDLE_TIME=5ms") // speed up rebalancing
	n.cmd.Env = append(n.cmd.Env, n.Cfg.ExtraEnv...)

	atomic.StoreInt32(&n.startSeq, n.seq.Next())

	_ = os.MkdirAll(n.logDir(), 0755)

	stdoutPath := filepath.Join(n.logDir(), "stdout")
	stdout, err := os.OpenFile(stdoutPath, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
	if err != nil {
		return errors.Wrapf(err, "unable to open file %s", stdoutPath)
	}
	// This causes the "node startup header" to be printed to stdout, which is
	// helpful and not too noisy.
	n.cmd.Stdout = io.MultiWriter(stdout, os.Stdout)

	stderrPath := filepath.Join(n.logDir(), "stderr")
	stderr, err := os.OpenFile(stderrPath, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
	if err != nil {
		return errors.Wrapf(err, "unable to open file %s", stderrPath)
	}
	n.cmd.Stderr = stderr

	if n.Cfg.RPCPort > 0 {
		n.rpcPort = fmt.Sprintf("%d", n.Cfg.RPCPort)
	}

	if err := n.cmd.Start(); err != nil {
		if err := stdout.Close(); err != nil {
			log.Warningf(ctx, "%v", err)
		}
		if err := stderr.Close(); err != nil {
			log.Warningf(ctx, "%v", err)
		}
		return errors.Wrapf(err, "running %s %v", n.cmd.Path, n.cmd.Args)
	}

	log.Infof(ctx, "process %d starting: %s", n.cmd.Process.Pid, n.cmd.Args)

	go func(cmd *exec.Cmd) {
		waitErr := cmd.Wait()
		if waitErr != nil {
			log.Warningf(ctx, "%v", waitErr)
		}
		if err := stdout.Close(); err != nil {
			log.Warningf(ctx, "%v", err)
		}
		if err := stderr.Close(); err != nil {
			log.Warningf(ctx, "%v", err)
		}

		log.Infof(ctx, "process %d: %s", cmd.Process.Pid, cmd.ProcessState)

		var execErr *exec.ExitError
		_ = errors.As(waitErr, &execErr)
		n.Lock()
		n.setNotRunningLocked(execErr)
		n.Unlock()
	}(n.cmd)

	return nil
}
// StartAsync starts a node asynchronously. It returns a buffered channel that
// receives either an error, or, once the node has started up and is fully
// functional, `nil`.
//
// If the node is already running, StartAsync does not restart it; the returned
// channel receives an error instead.
func (n *Node) StartAsync(ctx context.Context, joins ...string) <-chan error {
	ch := make(chan error, 1)

	if err := func() error {
		n.Lock()
		defer n.Unlock()
		if n.cmd != nil {
			return errors.New("server is already running")
		}
		return n.startAsyncInnerLocked(ctx, joins...)
	}(); err != nil {
		ch <- err
		return ch
	}

	go func() {
		// If the node does not become live within a minute, something is wrong and
		// it's better not to hang indefinitely.
		ch <- n.waitUntilLive(time.Minute)
	}()

	return ch
}

func portFromURL(rawURL string) (string, *url.URL, error) {
	u, err := url.Parse(rawURL)
	if err != nil {
		return "", nil, err
	}

	_, port, err := net.SplitHostPort(u.Host)
	return port, u, err
}

func makeDB(url string, numWorkers int, dbName string) *gosql.DB {
	conn, err := gosql.Open("postgres", url)
	if err != nil {
		log.Fatalf(context.Background(), "%v", err)
	}
	if numWorkers == 0 {
		numWorkers = 1
	}
	conn.SetMaxOpenConns(numWorkers)
	conn.SetMaxIdleConns(numWorkers)
	return conn
}

func (n *Node) advertiseAddrFile() string {
	return filepath.Join(n.Cfg.DataDir, "cockroach.advertise-addr")
}

func (n *Node) httpAddrFile() string {
	return filepath.Join(n.Cfg.DataDir, "cockroach.http-addr")
}

func readFileOrEmpty(f string) string {
	c, err := ioutil.ReadFile(f)
	if err != nil {
		if !os.IsNotExist(err) {
			panic(err)
		}
		return ""
	}
	return string(c)
}

// AdvertiseAddr returns the Node's AdvertiseAddr or empty if none is available.
func (n *Node) AdvertiseAddr() (s string) {
	addr := readFileOrEmpty(n.advertiseAddrFile())
	if addr != "" {
		return addr
	}
	// The below is part of the workaround for nodes at v1.0 which don't
	// write the file above, explained in more detail in StartAsync().
	if port := n.RPCPort(); port != "" {
		return net.JoinHostPort(n.IPAddr(), n.RPCPort())
	}
	return addr
}
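// exampleStartAsync is a hypothetical sketch of the asynchronous start path:
// the buffered channel returned by StartAsync yields nil once the node is
// live, or the startup error otherwise, and waitUntilLive's one-minute limit
// bounds how long the receive can block. The helper itself is illustrative.
func exampleStartAsync(ctx context.Context, n *Node, joins ...string) error {
	errCh := n.StartAsync(ctx, joins...)
	// Other setup work could happen here while the node boots.
	return <-errCh
}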
func (n *Node) waitUntilLive(dur time.Duration) error {
	ctx := context.Background()
	closer := make(chan struct{})
	defer time.AfterFunc(dur, func() { close(closer) }).Stop()
	opts := retry.Options{
		InitialBackoff: time.Millisecond,
		MaxBackoff:     500 * time.Millisecond,
		Multiplier:     2,
		Closer:         closer,
	}
	for r := retry.Start(opts); r.Next(); {
		var pid int
		n.Lock()
		if n.cmd != nil {
			pid = n.cmd.Process.Pid
		}
		n.Unlock()
		if pid == 0 {
			log.Info(ctx, "process already quit")
			return nil
		}

		urlBytes, err := ioutil.ReadFile(n.listeningURLFile())
		if err != nil {
			log.Infof(ctx, "%v", err)
			continue
		}

		var pgURL *url.URL
		_, pgURL, err = portFromURL(string(urlBytes))
		if err != nil {
			log.Infof(ctx, "%v", err)
			continue
		}

		if n.Cfg.RPCPort == 0 {
			n.Lock()
			n.rpcPort = pgURL.Port()
			n.Unlock()
		}

		pgURL.Path = n.Cfg.DB
		n.Lock()
		n.pgURL = pgURL.String()
		n.Unlock()

		var uiURL *url.URL

		defer func() {
			log.Infof(ctx, "process %d started (db: %s ui: %s)", pid, pgURL, uiURL)
		}()

		// We're basically running, but (at least) the decommissioning test sometimes starts
		// up servers that can already be draining when they get here. For that reason, leave
		// the admin port undefined if we don't manage to get it.
		//
		// This can be improved by making the below code run opportunistically whenever the
		// http port is required but isn't initialized yet.
		n.Lock()
		n.db = makeDB(n.pgURL, n.Cfg.NumWorkers, n.Cfg.DB)
		n.Unlock()

		{
			var uiStr string
			if err := n.db.QueryRow(
				`SELECT value FROM crdb_internal.node_runtime_info WHERE component='UI' AND field = 'URL'`,
			).Scan(&uiStr); err != nil {
				log.Infof(ctx, "%v", err)
				return nil
			}

			_, uiURL, err = portFromURL(uiStr)
			if err != nil {
				log.Infof(ctx, "%v", err)
				// TODO(tschottdorf): see above.
			}
		}
		return nil
	}
	return errors.Errorf("node %+v was unable to join cluster within %s", n.Cfg, dur)
}

// Kill stops a node abruptly by sending it SIGKILL.
func (n *Node) Kill() {
	n.Signal(os.Kill)
	// Wait for the process to have been cleaned up (or a call to Start() could
	// turn into an unintended no-op).
	for ok := false; !ok; {
		n.Lock()
		ok = n.cmd == nil
		n.Unlock()
	}
}

// IPAddr returns the node's listening address (for ui, inter-node, cli, and
// Postgres alike).
func (n *Node) IPAddr() string {
	return n.Cfg.Addr
}

// DB returns a Postgres connection set up to talk to the node.
func (n *Node) DB() *gosql.DB {
	n.Lock()
	defer n.Unlock()
	return n.db
}

// Signal sends the given signal to the process. It is a no-op if the process is
// not running.
func (n *Node) Signal(s os.Signal) {
	n.Lock()
	defer n.Unlock()
	if n.cmd == nil || n.cmd.Process == nil {
		return
	}
	if err := n.cmd.Process.Signal(s); err != nil {
		log.Warningf(context.Background(), "%v", err)
	}
}

// Wait waits for the process to terminate and returns the error from the
// process' Wait(). The result is nil if the process terminated with a zero
// exit code.
func (n *Node) Wait() *exec.ExitError {
	n.Lock()
	ch := n.notRunning
	n.Unlock()
	if ch == nil {
		log.Warning(context.Background(), "(*Node).Wait called when node was not running")
		return nil
	}
	<-ch
	ee, _ := n.waitErr.Load().(*exec.ExitError)
	return ee
}

// Silence unused warning.
var _ = (*Node)(nil).Wait
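// exampleRestart is a hypothetical sketch of a hard-restart cycle: Kill blocks
// until the old process has been reaped, so the subsequent Start does not turn
// into an unintended no-op, and the cluster's joins() result points the node
// back at the first-started live node. Illustrative only, not part of the
// original package.
func exampleRestart(ctx context.Context, c *Cluster, nodeIdx int) {
	n := c.Nodes[nodeIdx]
	n.Kill()
	n.Start(ctx, c.joins()...)
}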