github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachtest/cluster.go (about) 1 // Copyright 2018 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package main 12 13 import ( 14 "bufio" 15 "bytes" 16 "context" 17 gosql "database/sql" 18 "encoding/json" 19 "fmt" 20 "io" 21 "io/ioutil" 22 "math/rand" 23 "net" 24 "net/url" 25 "os" 26 "os/exec" 27 "os/user" 28 "path/filepath" 29 "regexp" 30 "sort" 31 "strconv" 32 "strings" 33 "sync" 34 "sync/atomic" 35 "time" 36 37 "github.com/armon/circbuf" 38 "github.com/cockroachdb/cockroach/pkg/util/contextutil" 39 "github.com/cockroachdb/cockroach/pkg/util/log" 40 "github.com/cockroachdb/cockroach/pkg/util/quotapool" 41 "github.com/cockroachdb/cockroach/pkg/util/syncutil" 42 "github.com/cockroachdb/cockroach/pkg/util/timeutil" 43 "github.com/cockroachdb/errors" 44 _ "github.com/lib/pq" 45 "golang.org/x/sync/errgroup" 46 ) 47 48 const ( 49 aws = "aws" 50 gce = "gce" 51 azure = "azure" 52 ) 53 54 var ( 55 local bool 56 cockroach string 57 cloud = gce 58 encrypt encryptValue = "false" 59 instanceType string 60 workload string 61 roachprod string 62 buildTag string 63 clusterName string 64 clusterWipe bool 65 zonesF string 66 teamCity bool 67 ) 68 69 type encryptValue string 70 71 func (v *encryptValue) String() string { 72 return string(*v) 73 } 74 75 func (v *encryptValue) Set(s string) error { 76 if s == "random" { 77 *v = encryptValue(s) 78 return nil 79 } 80 t, err := strconv.ParseBool(s) 81 if err != nil { 82 return err 83 } 84 *v = encryptValue(fmt.Sprint(t)) 85 return nil 86 } 87 88 func (v *encryptValue) asBool() bool { 89 if *v == "random" { 90 return rand.Intn(2) == 0 91 } 92 t, err := strconv.ParseBool(string(*v)) 93 if err != nil { 94 return false 95 } 96 return t 97 } 98 99 func (v *encryptValue) Type() string { 100 return "string" 101 } 102 103 func ifLocal(trueVal, falseVal string) string { 104 if local { 105 return trueVal 106 } 107 return falseVal 108 } 109 110 func filepathAbs(path string) (string, error) { 111 path, err := filepath.Abs(path) 112 if err != nil { 113 return "", errors.WithStack(err) 114 } 115 return path, nil 116 } 117 118 func findBinary(binary, defValue string) (string, error) { 119 if binary == "" { 120 binary = defValue 121 } 122 123 // Check to see if binary exists and is a regular file and executable. 124 if fi, err := os.Stat(binary); err == nil && fi.Mode().IsRegular() && (fi.Mode()&0111) != 0 { 125 return filepathAbs(binary) 126 } 127 128 // Find the binary to run and translate it to an absolute path. First, look 129 // for the binary in PATH. 130 path, err := exec.LookPath(binary) 131 if err != nil { 132 if strings.HasPrefix(binary, "/") { 133 return "", errors.WithStack(err) 134 } 135 // We're unable to find the binary in PATH and "binary" is a relative path: 136 // look in the cockroach repo. 
137 gopath := os.Getenv("GOPATH") 138 if gopath == "" { 139 gopath = filepath.Join(os.Getenv("HOME"), "go") 140 } 141 142 var binSuffix string 143 if !local { 144 binSuffix = ".docker_amd64" 145 } 146 dirs := []string{ 147 filepath.Join(gopath, "/src/github.com/cockroachdb/cockroach/"), 148 filepath.Join(gopath, "/src/github.com/cockroachdb/cockroach/bin"+binSuffix), 149 filepath.Join(os.ExpandEnv("$PWD"), "bin"+binSuffix), 150 } 151 for _, dir := range dirs { 152 path = filepath.Join(dir, binary) 153 var err2 error 154 path, err2 = exec.LookPath(path) 155 if err2 == nil { 156 return filepathAbs(path) 157 } 158 } 159 return "", fmt.Errorf("failed to find %q in $PATH or any of %s", binary, dirs) 160 } 161 return filepathAbs(path) 162 } 163 164 func initBinaries() { 165 // If we're running against an existing "local" cluster, force the local flag 166 // to true in order to get the "local" test configurations. 167 if clusterName == "local" { 168 local = true 169 } 170 171 cockroachDefault := "cockroach" 172 if !local { 173 cockroachDefault = "cockroach-linux-2.6.32-gnu-amd64" 174 } 175 var err error 176 cockroach, err = findBinary(cockroach, cockroachDefault) 177 if err != nil { 178 fmt.Fprintf(os.Stderr, "%+v\n", err) 179 os.Exit(1) 180 } 181 182 workload, err = findBinary(workload, "workload") 183 if err != nil { 184 fmt.Fprintf(os.Stderr, "%+v\n", err) 185 os.Exit(1) 186 } 187 188 roachprod, err = findBinary(roachprod, "roachprod") 189 if err != nil { 190 fmt.Fprintf(os.Stderr, "%+v\n", err) 191 os.Exit(1) 192 } 193 } 194 195 type clusterRegistry struct { 196 mu struct { 197 syncutil.Mutex 198 clusters map[string]*cluster 199 tagCount map[string]int 200 // savedClusters keeps track of clusters that have been saved for further 201 // debugging. Each cluster comes with a message about the test failure 202 // causing it to be saved for debugging. 203 savedClusters map[*cluster]string 204 } 205 } 206 207 func newClusterRegistry() *clusterRegistry { 208 cr := &clusterRegistry{} 209 cr.mu.clusters = make(map[string]*cluster) 210 cr.mu.savedClusters = make(map[*cluster]string) 211 return cr 212 } 213 214 func (r *clusterRegistry) registerCluster(c *cluster) error { 215 r.mu.Lock() 216 defer r.mu.Unlock() 217 if r.mu.clusters[c.name] != nil { 218 return fmt.Errorf("cluster named %q already exists in registry", c.name) 219 } 220 r.mu.clusters[c.name] = c 221 return nil 222 } 223 224 func (r *clusterRegistry) unregisterCluster(c *cluster) bool { 225 r.mu.Lock() 226 defer r.mu.Unlock() 227 if _, ok := r.mu.clusters[c.name]; !ok { 228 // If the cluster is not registered, no-op. This allows the 229 // method to be called defensively. 230 return false 231 } 232 delete(r.mu.clusters, c.name) 233 if c.tag != "" { 234 if _, ok := r.mu.tagCount[c.tag]; !ok { 235 panic(fmt.Sprintf("tagged cluster not accounted for: %s", c)) 236 } 237 r.mu.tagCount[c.tag]-- 238 } 239 return true 240 } 241 242 func (r *clusterRegistry) countForTag(tag string) int { 243 r.mu.Lock() 244 defer r.mu.Unlock() 245 return r.mu.tagCount[tag] 246 } 247 248 // markClusterAsSaved marks c such that it will not be destroyed by 249 // destroyAllClusters. 250 // msg is a message recording the reason why the cluster is being saved (i.e. 251 // generally a test failure error). 
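//
// A minimal, illustrative sketch of how the registry is used by the test
// runner (not a verbatim call site; error handling elided, message made up):
//
//   cr := newClusterRegistry()
//   if err := cr.registerCluster(c); err != nil { /* name collision */ }
//   cr.markClusterAsSaved(c, "saved for debugging: test foo failed")
//   defer cr.unregisterCluster(c)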
252 func (r *clusterRegistry) markClusterAsSaved(c *cluster, msg string) { 253 r.mu.Lock() 254 r.mu.savedClusters[c] = msg 255 r.mu.Unlock() 256 } 257 258 type clusterWithMsg struct { 259 *cluster 260 savedMsg string 261 } 262 263 // savedClusters returns the list of clusters that have been saved for 264 // debugging. 265 func (r *clusterRegistry) savedClusters() []clusterWithMsg { 266 r.mu.Lock() 267 defer r.mu.Unlock() 268 res := make([]clusterWithMsg, len(r.mu.savedClusters)) 269 i := 0 270 for c, msg := range r.mu.savedClusters { 271 res[i] = clusterWithMsg{ 272 cluster: c, 273 savedMsg: msg, 274 } 275 i++ 276 } 277 sort.Slice(res, func(i, j int) bool { 278 return strings.Compare(res[i].name, res[j].name) < 0 279 }) 280 return res 281 } 282 283 // destroyAllClusters destroys all the clusters (except for "saved" ones) and 284 // blocks until they're destroyed. It responds to context cancelation by 285 // interrupting the waiting; the cluster destruction itself does not inherit the 286 // cancelation. 287 func (r *clusterRegistry) destroyAllClusters(ctx context.Context, l *logger) { 288 // Fire off a goroutine to destroy all of the clusters. 289 done := make(chan struct{}) 290 go func() { 291 defer close(done) 292 293 var clusters []*cluster 294 savedClusters := make(map[*cluster]struct{}) 295 r.mu.Lock() 296 for _, c := range r.mu.clusters { 297 clusters = append(clusters, c) 298 } 299 for c := range r.mu.savedClusters { 300 savedClusters[c] = struct{}{} 301 } 302 r.mu.Unlock() 303 304 var wg sync.WaitGroup 305 wg.Add(len(clusters)) 306 for _, c := range clusters { 307 go func(c *cluster) { 308 defer wg.Done() 309 if _, ok := savedClusters[c]; !ok { 310 // We don't close the logger here since the cluster may be still in use 311 // by a test, and so the logger might still be needed. 312 c.Destroy(ctx, dontCloseLogger, l) 313 } 314 }(c) 315 } 316 317 wg.Wait() 318 }() 319 320 select { 321 case <-done: 322 case <-ctx.Done(): 323 } 324 } 325 326 // execCmd is like execCmdEx, but doesn't return the command's output. 327 func execCmd(ctx context.Context, l *logger, args ...string) error { 328 return execCmdEx(ctx, l, args...).err 329 } 330 331 type cmdRes struct { 332 err error 333 // stdout and stderr are the commands output. Note that this is truncated and 334 // only a tail is returned. 335 stdout, stderr string 336 } 337 338 // execCmdEx runs a command and returns its error and output. 339 // 340 // Note that the output is truncated; only a tail is returned. 341 // Also note that if the command exits with an error code, its output is also 342 // included in cmdRes.err. 343 func execCmdEx(ctx context.Context, l *logger, args ...string) cmdRes { 344 var cancel func() 345 ctx, cancel = context.WithCancel(ctx) 346 defer cancel() 347 348 l.Printf("> %s\n", strings.Join(args, " ")) 349 cmd := exec.CommandContext(ctx, args[0], args[1:]...) 350 351 debugStdoutBuffer, _ := circbuf.NewBuffer(4096) 352 debugStderrBuffer, _ := circbuf.NewBuffer(1024) 353 354 // Do a dance around https://github.com/golang/go/issues/23019. 355 // When the command we run launches a subprocess, that subprocess receives 356 // a copy of our Command's Stdout/Stderr file descriptor, which effectively 357 // means that the file descriptors close only when that subcommand returns. 358 // However, proactively killing the subcommand is not really possible - we 359 // will only manage to kill the parent process that we launched directly. 
360 // In practice this means that if we try to react to context cancellation, 361 // the pipes we read the output from will wait for the *subprocess* to 362 // terminate, leaving us hanging, potentially indefinitely. 363 // To work around it, use pipes and set a read deadline on our (read) end of 364 // the pipes when we detect a context cancellation. 365 // 366 // See TestExecCmd for a test. 367 var closePipes func(ctx context.Context) 368 var wg sync.WaitGroup 369 { 370 371 var wOut, wErr, rOut, rErr *os.File 372 var cwOnce sync.Once 373 closePipes = func(ctx context.Context) { 374 // Idempotently closes the writing end of the pipes. This is called either 375 // when the process returns or when it was killed due to context 376 // cancellation. In the former case, close the writing ends of the pipe 377 // so that the copy goroutines started below return (without missing any 378 // output). In the context cancellation case, we set a deadline to force 379 // the goroutines to quit eagerly. This is important since the command 380 // may have duplicated wOut and wErr to its possible subprocesses, which 381 // may continue to run for long periods of time, and would otherwise 382 // block this command. In theory this is possible also when the command 383 // returns on its own accord, so we set a (more lenient) deadline in the 384 // first case as well. 385 // 386 // NB: there's also the option (at least on *nix) to use a process group, 387 // but it doesn't look portable: 388 // https://medium.com/@felixge/killing-a-child-process-and-all-of-its-children-in-go-54079af94773 389 cwOnce.Do(func() { 390 if wOut != nil { 391 _ = wOut.Close() 392 } 393 if wErr != nil { 394 _ = wErr.Close() 395 } 396 dur := 10 * time.Second // wait up to 10s for subprocesses 397 if ctx.Err() != nil { 398 dur = 10 * time.Millisecond 399 } 400 deadline := timeutil.Now().Add(dur) 401 if rOut != nil { 402 _ = rOut.SetReadDeadline(deadline) 403 } 404 if rErr != nil { 405 _ = rErr.SetReadDeadline(deadline) 406 } 407 }) 408 } 409 defer closePipes(ctx) 410 411 var err error 412 rOut, wOut, err = os.Pipe() 413 if err != nil { 414 return cmdRes{err: err} 415 } 416 417 rErr, wErr, err = os.Pipe() 418 if err != nil { 419 return cmdRes{err: err} 420 } 421 422 cmd.Stdout = wOut 423 wg.Add(1) 424 go func() { 425 defer wg.Done() 426 _, _ = io.Copy(l.stdout, io.TeeReader(rOut, debugStdoutBuffer)) 427 }() 428 429 if l.stderr == l.stdout { 430 // If l.stderr == l.stdout, we use only one pipe to avoid 431 // duplicating everything. 432 cmd.Stderr = wOut 433 } else { 434 cmd.Stderr = wErr 435 wg.Add(1) 436 go func() { 437 defer wg.Done() 438 _, _ = io.Copy(l.stderr, io.TeeReader(rErr, debugStderrBuffer)) 439 }() 440 } 441 } 442 443 err := cmd.Run() 444 closePipes(ctx) 445 wg.Wait() 446 447 if err != nil { 448 // Context errors opaquely appear as "signal killed" when manifested. 449 // We surface this error explicitly. 
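 		// (Concretely: the process error is combined with ctx.Err() when the
 		// context was canceled, and the command line plus the buffered
 		// stdout/stderr tails are attached via withCommandDetails below;
 		// GetStderr can later recover the stderr tail from the returned
 		// cmdRes.err.)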
450 if ctx.Err() != nil { 451 err = errors.CombineErrors(ctx.Err(), err) 452 } 453 454 if err != nil { 455 err = &withCommandDetails{ 456 cause: err, 457 cmd: strings.Join(args, " "), 458 stderr: debugStderrBuffer.String(), 459 stdout: debugStdoutBuffer.String(), 460 } 461 } 462 } 463 464 return cmdRes{ 465 err: err, 466 stdout: debugStdoutBuffer.String(), 467 stderr: debugStderrBuffer.String(), 468 } 469 } 470 471 type withCommandDetails struct { 472 cause error 473 cmd string 474 stderr string 475 stdout string 476 } 477 478 var _ error = (*withCommandDetails)(nil) 479 var _ errors.Formatter = (*withCommandDetails)(nil) 480 481 // Error implements error. 482 func (e *withCommandDetails) Error() string { return e.cause.Error() } 483 484 // Cause implements causer. 485 func (e *withCommandDetails) Cause() error { return e.cause } 486 487 // Format implements fmt.Formatter. 488 func (e *withCommandDetails) Format(s fmt.State, verb rune) { errors.FormatError(e, s, verb) } 489 490 // FormatError implements errors.Formatter. 491 func (e *withCommandDetails) FormatError(p errors.Printer) error { 492 p.Printf("%s returned", e.cmd) 493 if p.Detail() { 494 p.Printf("stderr:\n%s\nstdout:\n%s", e.stderr, e.stdout) 495 } 496 return e.cause 497 } 498 499 // GetStderr retrieves the stderr output of a command that 500 // returned with an error, or the empty string if there was no stderr. 501 func GetStderr(err error) string { 502 var c *withCommandDetails 503 if errors.As(err, &c) { 504 return c.stderr 505 } 506 return "" 507 } 508 509 // execCmdWithBuffer executes the given command and returns its stdout/stderr 510 // output. If the return code is not 0, an error is also returned. 511 // l is used to log the command before running it. No output is logged. 512 func execCmdWithBuffer(ctx context.Context, l *logger, args ...string) ([]byte, error) { 513 l.Printf("> %s\n", strings.Join(args, " ")) 514 cmd := exec.CommandContext(ctx, args[0], args[1:]...) 515 516 out, err := cmd.CombinedOutput() 517 if err != nil { 518 return out, errors.Wrapf(err, `%s`, strings.Join(args, ` `)) 519 } 520 return out, nil 521 } 522 523 func makeGCEClusterName(name string) string { 524 name = strings.ToLower(name) 525 name = regexp.MustCompile(`[^-a-z0-9]+`).ReplaceAllString(name, "-") 526 name = regexp.MustCompile(`-+`).ReplaceAllString(name, "-") 527 return name 528 } 529 530 func makeClusterName(name string) string { 531 return makeGCEClusterName(name) 532 } 533 534 // MachineTypeToCPUs returns a CPU count for either a GCE or AWS 535 // machine type. 536 func MachineTypeToCPUs(s string) int { 537 { 538 // GCE machine types. 539 var v int 540 if _, err := fmt.Sscanf(s, "n1-standard-%d", &v); err == nil { 541 return v 542 } 543 if _, err := fmt.Sscanf(s, "n1-highcpu-%d", &v); err == nil { 544 return v 545 } 546 if _, err := fmt.Sscanf(s, "n1-highmem-%d", &v); err == nil { 547 return v 548 } 549 } 550 551 typeAndSize := strings.Split(s, ".") 552 553 if len(typeAndSize) == 2 { 554 size := typeAndSize[1] 555 556 switch size { 557 case "large": 558 return 2 559 case "xlarge": 560 return 4 561 case "2xlarge": 562 return 8 563 case "4xlarge": 564 return 16 565 case "9xlarge": 566 return 36 567 case "12xlarge": 568 return 48 569 case "18xlarge": 570 return 72 571 case "24xlarge": 572 return 96 573 } 574 } 575 576 // Azure doesn't have a standard way to size machines. 577 // This method is implemented for the default machine type. 
578 // Not all of Azure machine types contain the number of vCPUs int he size and 579 // the sizing naming scheme is dependent on the machine type family. 580 switch s { 581 case "Standard_D2_v3": 582 return 2 583 case "Standard_D4_v3": 584 return 4 585 case "Standard_D8_v3": 586 return 8 587 case "Standard_D16_v3": 588 return 16 589 case "Standard_D32_v3": 590 return 32 591 case "Standard_D48_v3": 592 return 48 593 case "Standard_D64_v3": 594 return 64 595 } 596 597 // TODO(pbardea): Non-default Azure machine types are not supported 598 // and will return unknown machine type error. 599 fmt.Fprintf(os.Stderr, "unknown machine type: %s\n", s) 600 os.Exit(1) 601 return -1 602 } 603 604 func awsMachineType(cpus int) string { 605 switch { 606 case cpus <= 2: 607 return "c5d.large" 608 case cpus <= 4: 609 return "c5d.xlarge" 610 case cpus <= 8: 611 return "c5d.2xlarge" 612 case cpus <= 16: 613 return "c5d.4xlarge" 614 case cpus <= 36: 615 return "c5d.9xlarge" 616 case cpus <= 72: 617 return "c5d.18xlarge" 618 case cpus <= 96: 619 // There is no c5d.24xlarge. 620 return "m5d.24xlarge" 621 default: 622 panic(fmt.Sprintf("no aws machine type with %d cpus", cpus)) 623 } 624 } 625 626 // Default GCE machine type when none is specified. 627 func gceMachineType(cpus int) string { 628 // TODO(peter): This is awkward: below 16 cpus, use n1-standard so that the 629 // machines have a decent amount of RAM. We could use customer machine 630 // configurations, but the rules for the amount of RAM per CPU need to be 631 // determined (you can't request any arbitrary amount of RAM). 632 if cpus < 16 { 633 return fmt.Sprintf("n1-standard-%d", cpus) 634 } 635 return fmt.Sprintf("n1-highcpu-%d", cpus) 636 } 637 638 func azureMachineType(cpus int) string { 639 switch { 640 case cpus <= 2: 641 return "Standard_D2_v3" 642 case cpus <= 4: 643 return "Standard_D4_v3" 644 case cpus <= 8: 645 return "Standard_D8_v3" 646 case cpus <= 16: 647 return "Standard_D16_v3" 648 case cpus <= 36: 649 return "Standard_D32_v3" 650 case cpus <= 48: 651 return "Standard_D48_v3" 652 case cpus <= 64: 653 return "Standard_D64_v3" 654 default: 655 panic(fmt.Sprintf("no azure machine type with %d cpus", cpus)) 656 } 657 } 658 659 func machineTypeFlag(machineType string) string { 660 switch cloud { 661 case aws: 662 if isSSD(machineType) { 663 return "--aws-machine-type-ssd" 664 } 665 return "--aws-machine-type" 666 case gce: 667 return "--gce-machine-type" 668 case azure: 669 return "--azure-machine-type" 670 default: 671 panic(fmt.Sprintf("unsupported cloud: %s\n", cloud)) 672 } 673 } 674 675 func isSSD(machineType string) bool { 676 if cloud != aws { 677 panic("can only differentiate SSDs based on machine type on AWS") 678 } 679 680 typeAndSize := strings.Split(machineType, ".") 681 if len(typeAndSize) == 2 { 682 awsType := typeAndSize[0] 683 // All SSD machine types that we use end in 'd or begins with i3 (e.g. i3, i3en). 684 return strings.HasPrefix(awsType, "i3") || strings.HasSuffix(awsType, "d") 685 } 686 687 fmt.Fprint(os.Stderr, "aws machine type does not match expected format 'type.size' (e.g. c5d.4xlarge)", machineType) 688 os.Exit(1) 689 return false 690 } 691 692 type testI interface { 693 Name() string 694 Fatal(args ...interface{}) 695 Fatalf(format string, args ...interface{}) 696 Failed() bool 697 // Path to a directory where the test is supposed to store its log and other 698 // artifacts. 
699 ArtifactsDir() string 700 logger() *logger 701 } 702 703 // TODO(tschottdorf): Consider using a more idiomatic approach in which options 704 // act upon a config struct: 705 // https://dave.cheney.net/2014/10/17/functional-options-for-friendly-apis 706 type option interface { 707 option() 708 } 709 710 type nodeSelector interface { 711 option 712 merge(nodeListOption) nodeListOption 713 } 714 715 type nodeListOption []int 716 717 func (n nodeListOption) option() {} 718 719 func (n nodeListOption) merge(o nodeListOption) nodeListOption { 720 t := make(nodeListOption, 0, len(n)+len(o)) 721 t = append(t, n...) 722 t = append(t, o...) 723 sort.Ints([]int(t)) 724 r := t[:1] 725 for i := 1; i < len(t); i++ { 726 if r[len(r)-1] != t[i] { 727 r = append(r, t[i]) 728 } 729 } 730 return r 731 } 732 733 func (n nodeListOption) randNode() nodeListOption { 734 return nodeListOption{n[rand.Intn(len(n))]} 735 } 736 737 func (n nodeListOption) String() string { 738 if len(n) == 0 { 739 return "" 740 } 741 742 var buf bytes.Buffer 743 buf.WriteByte(':') 744 745 appendRange := func(start, end int) { 746 if buf.Len() > 1 { 747 buf.WriteByte(',') 748 } 749 if start == end { 750 fmt.Fprintf(&buf, "%d", start) 751 } else { 752 fmt.Fprintf(&buf, "%d-%d", start, end) 753 } 754 } 755 756 start, end := -1, -1 757 for _, i := range n { 758 if start != -1 && end == i-1 { 759 end = i 760 continue 761 } 762 if start != -1 { 763 appendRange(start, end) 764 } 765 start, end = i, i 766 } 767 if start != -1 { 768 appendRange(start, end) 769 } 770 return buf.String() 771 } 772 773 // clusterSpec represents a test's description of what its cluster needs to 774 // look like. It becomes part of a clusterConfig when the cluster is created. 775 type clusterSpec struct { 776 NodeCount int 777 // CPUs is the number of CPUs per node. 778 CPUs int 779 Zones string 780 Geo bool 781 Lifetime time.Duration 782 ReusePolicy clusterReusePolicy 783 } 784 785 func makeClusterSpec(nodeCount int, opts ...createOption) clusterSpec { 786 spec := clusterSpec{NodeCount: nodeCount} 787 defaultOpts := []createOption{cpu(4), nodeLifetimeOption(12 * time.Hour), reuseAny()} 788 for _, o := range append(defaultOpts, opts...) { 789 o.apply(&spec) 790 } 791 return spec 792 } 793 794 func clustersCompatible(s1, s2 clusterSpec) bool { 795 s1.Lifetime = 0 796 s2.Lifetime = 0 797 return s1 == s2 798 } 799 800 func (s clusterSpec) String() string { 801 str := fmt.Sprintf("n%dcpu%d", s.NodeCount, s.CPUs) 802 if s.Geo { 803 str += "-geo" 804 } 805 return str 806 } 807 808 func firstZone(zones string) string { 809 return strings.SplitN(zones, ",", 2)[0] 810 } 811 812 func (s *clusterSpec) args() []string { 813 var args []string 814 815 switch cloud { 816 case aws: 817 if s.Zones != "" { 818 fmt.Fprintf(os.Stderr, "zones spec not yet supported on AWS: %s\n", s.Zones) 819 os.Exit(1) 820 } 821 if s.Geo { 822 fmt.Fprintf(os.Stderr, "geo-distributed clusters not yet supported on AWS\n") 823 os.Exit(1) 824 } 825 826 args = append(args, "--clouds=aws") 827 case azure: 828 args = append(args, "--clouds=azure") 829 } 830 831 if !local && s.CPUs != 0 { 832 // Use the machine type specified as a CLI flag. 833 machineType := instanceType 834 if len(machineType) == 0 { 835 // If no machine type was specified, choose one 836 // based on the cloud and CPU count. 
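 		// For example, with the mappings defined above, s.CPUs == 16 resolves
 		// to c5d.4xlarge on AWS, n1-highcpu-16 on GCE, and Standard_D16_v3 on
 		// Azure.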
837 switch cloud { 838 case aws: 839 machineType = awsMachineType(s.CPUs) 840 case gce: 841 machineType = gceMachineType(s.CPUs) 842 case azure: 843 machineType = azureMachineType(s.CPUs) 844 } 845 } 846 if cloud == aws { 847 if isSSD(machineType) { 848 args = append(args, "--local-ssd=true") 849 } else { 850 args = append(args, "--local-ssd=false") 851 } 852 } 853 machineTypeArg := machineTypeFlag(machineType) + "=" + machineType 854 args = append(args, machineTypeArg) 855 } 856 if s.Zones != "" { 857 switch cloud { 858 case gce: 859 if s.Geo { 860 args = append(args, "--gce-zones="+s.Zones) 861 } else { 862 args = append(args, "--gce-zones="+firstZone(s.Zones)) 863 } 864 case azure: 865 args = append(args, "--azure-locations="+s.Zones) 866 default: 867 fmt.Fprintf(os.Stderr, "specifying zones is not yet supported on %s", cloud) 868 os.Exit(1) 869 } 870 } 871 if s.Geo { 872 args = append(args, "--geo") 873 } 874 if s.Lifetime != 0 { 875 args = append(args, "--lifetime="+s.Lifetime.String()) 876 } 877 return args 878 } 879 880 func (s *clusterSpec) expiration() time.Time { 881 l := s.Lifetime 882 if l == 0 { 883 l = 12 * time.Hour 884 } 885 return timeutil.Now().Add(l) 886 } 887 888 type createOption interface { 889 apply(spec *clusterSpec) 890 } 891 892 type nodeCPUOption int 893 894 func (o nodeCPUOption) apply(spec *clusterSpec) { 895 spec.CPUs = int(o) 896 } 897 898 // cpu is a node option which requests nodes with the specified number of CPUs. 899 func cpu(n int) nodeCPUOption { 900 return nodeCPUOption(n) 901 } 902 903 type nodeGeoOption struct{} 904 905 func (o nodeGeoOption) apply(spec *clusterSpec) { 906 spec.Geo = true 907 } 908 909 // geo is a node option which requests geo-distributed nodes. 910 func geo() nodeGeoOption { 911 return nodeGeoOption{} 912 } 913 914 type nodeZonesOption string 915 916 func (o nodeZonesOption) apply(spec *clusterSpec) { 917 spec.Zones = string(o) 918 } 919 920 // zones is a node option which requests geo-distributed nodes. Note that this 921 // overrides the --zones flag and is useful for tests that require running on 922 // specific zones. 923 func zones(s string) nodeZonesOption { 924 return nodeZonesOption(s) 925 } 926 927 type nodeLifetimeOption time.Duration 928 929 func (o nodeLifetimeOption) apply(spec *clusterSpec) { 930 spec.Lifetime = time.Duration(o) 931 } 932 933 // clusterReusePolicy indicates what clusters a particular test can run on and 934 // who (if anybody) can reuse the cluster after the test has finished running 935 // (either passing or failing). See the individual policies for details. 936 // 937 // Only tests whose cluster spec matches can ever run on the same 938 // cluster, regardless of this policy. 939 // 940 // Clean clusters (freshly-created clusters or cluster on which a test with the 941 // Any policy ran) are accepted by all policies. 942 // 943 // Note that not all combinations of "what cluster can I accept" and "how am I 944 // soiling this cluster" can be expressed. For example, there's no way to 945 // express that I'll accept a cluster that was tagged a certain way but after me 946 // nobody else can reuse the cluster at all. 947 type clusterReusePolicy interface { 948 clusterReusePolicy() 949 } 950 951 // reusePolicyAny means that only clean clusters are accepted and the cluster 952 // can be used by any other test (i.e. the cluster remains "clean"). 953 type reusePolicyAny struct{} 954 955 // reusePolicyNone means that only clean clusters are accepted and the cluster 956 // cannot be reused afterwards. 
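//
// (The policies are attached to a clusterSpec through the reuseAny, reuseNone
// and reuseTagged option helpers below. Purely as an illustration, a test that
// dirties clusters in a "backup"-specific way might ask for
// makeClusterSpec(4, cpu(16), reuseTagged("backup")); the tag name here is
// made up.)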
957 type reusePolicyNone struct{} 958 959 // reusePolicyTagged means that clusters left over by similarly-tagged tests are 960 // accepted in addition to clean cluster and, regardless of how the cluster 961 // started up, it will be tagged with the given tag at the end (so only 962 // similarly-tagged tests can use it afterwards). 963 // 964 // The idea is that a tag identifies a particular way in which a test is soiled, 965 // since it's common for groups of tests to mess clusters up in similar ways and 966 // to also be able to reset the cluster when the test starts. It's like a virus 967 // - if you carry it, you infect a clean host and can otherwise intermingle with 968 // other hosts that are already infected. Note that using this policy assumes 969 // that the way in which every test soils the cluster is idempotent. 970 type reusePolicyTagged struct{ tag string } 971 972 func (reusePolicyAny) clusterReusePolicy() {} 973 func (reusePolicyNone) clusterReusePolicy() {} 974 func (reusePolicyTagged) clusterReusePolicy() {} 975 976 type clusterReusePolicyOption struct { 977 p clusterReusePolicy 978 } 979 980 func reuseAny() clusterReusePolicyOption { 981 return clusterReusePolicyOption{p: reusePolicyAny{}} 982 } 983 func reuseNone() clusterReusePolicyOption { 984 return clusterReusePolicyOption{p: reusePolicyNone{}} 985 } 986 func reuseTagged(tag string) clusterReusePolicyOption { 987 return clusterReusePolicyOption{p: reusePolicyTagged{tag: tag}} 988 } 989 990 func (p clusterReusePolicyOption) apply(spec *clusterSpec) { 991 spec.ReusePolicy = p.p 992 } 993 994 // cluster provides an interface for interacting with a set of machines, 995 // starting and stopping a cockroach cluster on a subset of those machines, and 996 // running load generators and other operations on the machines. 997 // 998 // A cluster is safe for concurrent use by multiple goroutines. 999 type cluster struct { 1000 name string 1001 tag string 1002 spec clusterSpec 1003 // status is used to communicate the test's status. The callback is a noop 1004 // until the cluster is passed to a test, at which point it's hooked up to 1005 // test.Status(). 1006 status func(...interface{}) 1007 t testI 1008 // r is the registry tracking this cluster. Destroying the cluster will 1009 // unregister it. 1010 r *clusterRegistry 1011 // l is the logger used to log various cluster operations. 1012 // DEPRECATED for use outside of cluster methods: Use a test's t.l instead. 1013 // This is generally set to the current test's logger. 1014 l *logger 1015 expiration time.Time 1016 // encryptDefault is true if the cluster should default to having encryption 1017 // at rest enabled. The default only applies if encryption is not explicitly 1018 // enabled or disabled by options passed to Start. 1019 encryptDefault bool 1020 1021 // destroyState contains state related to the cluster's destruction. 1022 destroyState destroyState 1023 } 1024 1025 func (c *cluster) String() string { 1026 return fmt.Sprintf("%s [tag:%s] (%d nodes)", c.name, c.tag, c.spec.NodeCount) 1027 } 1028 1029 type destroyState struct { 1030 // owned is set if this instance is responsible for `roachprod destroy`ing the 1031 // cluster. It is set when a new cluster is created, but not when we attach to 1032 // an existing roachprod cluster. 1033 // If not set, Destroy() only wipes the cluster. 1034 owned bool 1035 1036 // alloc is set if owned is set. If set, it represents resources in a 1037 // QuotaPool that need to be released when the cluster is destroyed. 
1038 alloc *quotapool.IntAlloc 1039 1040 mu struct { 1041 syncutil.Mutex 1042 loggerClosed bool 1043 // destroyed is used to coordinate between different goroutines that want to 1044 // destroy a cluster. It is set once the destroy process starts. It it 1045 // closed when the destruction is complete. 1046 destroyed chan struct{} 1047 // saved is set if this cluster should not be wiped or destroyed. It should 1048 // be left alone for further debugging. This is kept in sync with the 1049 // clusterRegistry which maintains a list of all saved clusters. 1050 saved bool 1051 // savedMsg records a message describing the reason why the cluster is being 1052 // saved. 1053 savedMsg string 1054 } 1055 } 1056 1057 // closeLogger closes c.l. It can be called multiple times. 1058 func (c *cluster) closeLogger() { 1059 c.destroyState.mu.Lock() 1060 defer c.destroyState.mu.Unlock() 1061 if c.destroyState.mu.loggerClosed { 1062 return 1063 } 1064 c.destroyState.mu.loggerClosed = true 1065 c.l.close() 1066 } 1067 1068 type clusterConfig struct { 1069 spec clusterSpec 1070 // artifactsDir is the path where log file will be stored. 1071 artifactsDir string 1072 localCluster bool 1073 useIOBarrier bool 1074 alloc *quotapool.IntAlloc 1075 } 1076 1077 // clusterFactory is a creator of clusters. 1078 type clusterFactory struct { 1079 // namePrefix is prepended to all cluster names. 1080 namePrefix string 1081 // counter is incremented with every new cluster. It's used as part of the cluster's name. 1082 // Accessed atomically. 1083 counter uint64 1084 // The registry with whom all clustered will be registered. 1085 r *clusterRegistry 1086 // artifactsDir is the directory in which the cluster creation log file will be placed. 1087 artifactsDir string 1088 // sem is a semaphore throttling the creation of clusters (because AWS has 1089 // ridiculous API calls limits). 1090 sem chan struct{} 1091 } 1092 1093 func newClusterFactory( 1094 user string, clustersID string, artifactsDir string, r *clusterRegistry, concurrentCreations int, 1095 ) *clusterFactory { 1096 secs := timeutil.Now().Unix() 1097 var prefix string 1098 if clustersID != "" { 1099 prefix = fmt.Sprintf("%s-%s-%d-", user, clustersID, secs) 1100 } else { 1101 prefix = fmt.Sprintf("%s-%d-", user, secs) 1102 } 1103 return &clusterFactory{ 1104 sem: make(chan struct{}, concurrentCreations), 1105 namePrefix: prefix, 1106 artifactsDir: artifactsDir, 1107 r: r, 1108 } 1109 } 1110 1111 // acquireSem blocks until the semaphore allows a new cluster creation. The 1112 // returned function needs to be called when cluster creation finished. 1113 func (f *clusterFactory) acquireSem() func() { 1114 f.sem <- struct{}{} 1115 return f.releaseSem 1116 } 1117 1118 func (f *clusterFactory) releaseSem() { 1119 <-f.sem 1120 } 1121 1122 // newCluster creates a new roachprod cluster. 1123 // 1124 // setStatus is called with status messages indicating the stage of cluster 1125 // creation. 1126 // 1127 // NOTE: setTest() needs to be called before a test can use this cluster. 1128 func (f *clusterFactory) newCluster( 1129 ctx context.Context, cfg clusterConfig, setStatus func(string), teeOpt teeOptType, 1130 ) (*cluster, error) { 1131 if ctx.Err() != nil { 1132 return nil, errors.Wrap(ctx.Err(), "newCluster") 1133 } 1134 1135 var name string 1136 if cfg.localCluster { 1137 name = "local" // The roachprod tool understands this magic name. 
1138 } else { 1139 count := atomic.AddUint64(&f.counter, 1) 1140 name = makeClusterName( 1141 fmt.Sprintf("%s-%02d-%s", f.namePrefix, count, cfg.spec.String())) 1142 } 1143 1144 if cfg.spec.NodeCount == 0 { 1145 // For tests. Return the minimum that makes them happy. 1146 c := &cluster{ 1147 name: name, 1148 expiration: timeutil.Now().Add(24 * time.Hour), 1149 status: func(...interface{}) {}, 1150 r: f.r, 1151 } 1152 if err := f.r.registerCluster(c); err != nil { 1153 return nil, err 1154 } 1155 return c, nil 1156 } 1157 1158 exp := cfg.spec.expiration() 1159 if cfg.localCluster { 1160 // Local clusters never expire. 1161 exp = timeutil.Now().Add(100000 * time.Hour) 1162 } 1163 c := &cluster{ 1164 name: name, 1165 spec: cfg.spec, 1166 status: func(...interface{}) {}, 1167 expiration: exp, 1168 encryptDefault: encrypt.asBool(), 1169 r: f.r, 1170 destroyState: destroyState{ 1171 owned: true, 1172 alloc: cfg.alloc, 1173 }, 1174 } 1175 1176 sargs := []string{roachprod, "create", c.name, "-n", fmt.Sprint(c.spec.NodeCount)} 1177 sargs = append(sargs, cfg.spec.args()...) 1178 if !local && zonesF != "" && cfg.spec.Zones == "" { 1179 if cfg.spec.Geo { 1180 sargs = append(sargs, "--gce-zones="+zonesF) 1181 } else { 1182 sargs = append(sargs, "--gce-zones="+firstZone(zonesF)) 1183 } 1184 } 1185 if !cfg.useIOBarrier { 1186 sargs = append(sargs, "--local-ssd-no-ext4-barrier") 1187 } 1188 1189 setStatus("acquring cluster creation semaphore") 1190 release := f.acquireSem() 1191 defer release() 1192 setStatus("roachprod create") 1193 c.status("creating cluster") 1194 1195 // Logs for creating a new cluster go to a dedicated log file. 1196 logPath := filepath.Join(f.artifactsDir, runnerLogsDir, "cluster-create", name+".log") 1197 l, err := rootLogger(logPath, teeOpt) 1198 if err != nil { 1199 log.Fatalf(ctx, "%v", err) 1200 } 1201 1202 success := false 1203 // Attempt to create a cluster several times, cause them clouds be flaky that 1204 // my phone says it's snowing. 1205 for i := 0; i < 3; i++ { 1206 err = execCmd(ctx, l, sargs...) 1207 if err == nil { 1208 success = true 1209 break 1210 } 1211 l.PrintfCtx(ctx, "Failed to create cluster.") 1212 if !strings.Contains(GetStderr(err), "already exists") { 1213 l.PrintfCtx(ctx, "Cleaning up in case it was partially created.") 1214 c.Destroy(ctx, closeLogger, l) 1215 } else { 1216 break 1217 } 1218 } 1219 if !success { 1220 return nil, err 1221 } 1222 1223 if err := f.r.registerCluster(c); err != nil { 1224 return nil, err 1225 } 1226 1227 c.status("idle") 1228 return c, nil 1229 } 1230 1231 type attachOpt struct { 1232 skipValidation bool 1233 // Implies skipWipe. 1234 skipStop bool 1235 skipWipe bool 1236 } 1237 1238 // attachToExistingCluster creates a cluster object based on machines that have 1239 // already been already allocated by roachprod. 1240 // 1241 // NOTE: setTest() needs to be called before a test can use this cluster. 1242 func attachToExistingCluster( 1243 ctx context.Context, name string, l *logger, spec clusterSpec, opt attachOpt, r *clusterRegistry, 1244 ) (*cluster, error) { 1245 exp := spec.expiration() 1246 if name == "local" { 1247 exp = timeutil.Now().Add(100000 * time.Hour) 1248 } 1249 c := &cluster{ 1250 name: name, 1251 spec: spec, 1252 status: func(...interface{}) {}, 1253 l: l, 1254 expiration: exp, 1255 encryptDefault: encrypt.asBool(), 1256 destroyState: destroyState{ 1257 // If we're attaching to an existing cluster, we're not going to destoy it. 
1258 owned: false, 1259 }, 1260 r: r, 1261 } 1262 1263 if err := r.registerCluster(c); err != nil { 1264 return nil, err 1265 } 1266 1267 if !opt.skipValidation { 1268 if err := c.validate(ctx, spec, l); err != nil { 1269 return nil, err 1270 } 1271 } 1272 1273 if !opt.skipStop { 1274 c.status("stopping cluster") 1275 if err := c.StopE(ctx, c.All()); err != nil { 1276 return nil, err 1277 } 1278 if !opt.skipWipe { 1279 if clusterWipe { 1280 if err := c.WipeE(ctx, l, c.All()); err != nil { 1281 return nil, err 1282 } 1283 } else { 1284 l.Printf("skipping cluster wipe\n") 1285 } 1286 } 1287 } 1288 1289 c.status("idle") 1290 return c, nil 1291 } 1292 1293 // setTest prepares c for being used on behalf of t. 1294 // 1295 // TODO(andrei): Get rid of c.t, c.l and of this method. 1296 func (c *cluster) setTest(t testI) { 1297 c.t = t 1298 c.l = t.logger() 1299 if impl, ok := t.(*test); ok { 1300 c.status = impl.Status 1301 } 1302 } 1303 1304 // StopCockroachGracefullyOnNode stops a running cockroach instance on the requested 1305 // node before a version upgrade. 1306 func (c *cluster) StopCockroachGracefullyOnNode(ctx context.Context, node int) error { 1307 port := fmt.Sprintf("{pgport:%d}", node) 1308 // Note that the following command line needs to run against both v2.1 1309 // and the current branch. Do not change it in a manner that is 1310 // incompatible with 2.1. 1311 if err := c.RunE(ctx, c.Node(node), "./cockroach quit --insecure --port="+port); err != nil { 1312 return err 1313 } 1314 // TODO (rohany): This comment below might be out of date. 1315 // NB: we still call Stop to make sure the process is dead when we try 1316 // to restart it (or we'll catch an error from the RocksDB dir being 1317 // locked). This won't happen unless run with --local due to timing. 1318 // However, it serves as a reminder that `./cockroach quit` doesn't yet 1319 // work well enough -- ideally all listeners and engines are closed by 1320 // the time it returns to the client. 1321 c.Stop(ctx, c.Node(node)) 1322 // TODO(tschottdorf): should return an error. I doubt that we want to 1323 // call these *testing.T-style methods on goroutines. 1324 return nil 1325 } 1326 1327 // Save marks the cluster as "saved" so that it doesn't get destroyed. 1328 func (c *cluster) Save(ctx context.Context, msg string, l *logger) { 1329 l.PrintfCtx(ctx, "saving cluster %s for debugging (--debug specified)", c) 1330 // TODO(andrei): should we extend the cluster here? For how long? 1331 if c.destroyState.owned { // we won't have an alloc for an unowned cluster 1332 c.destroyState.alloc.Freeze() 1333 } 1334 c.r.markClusterAsSaved(c, msg) 1335 c.destroyState.mu.Lock() 1336 c.destroyState.mu.saved = true 1337 c.destroyState.mu.savedMsg = msg 1338 c.destroyState.mu.Unlock() 1339 } 1340 1341 // validateCluster takes a cluster and checks that the reality corresponds to 1342 // the cluster's spec. It's intended to be used with clusters created by 1343 // attachToExistingCluster(); otherwise, clusters create with newCluster() are 1344 // know to be up to spec. 1345 func (c *cluster) validate(ctx context.Context, nodes clusterSpec, l *logger) error { 1346 // Perform validation on the existing cluster. 1347 c.status("checking that existing cluster matches spec") 1348 sargs := []string{roachprod, "list", c.name, "--json", "--quiet"} 1349 out, err := execCmdWithBuffer(ctx, l, sargs...) 1350 if err != nil { 1351 return err 1352 } 1353 1354 // jsonOutput matches the structure of the output from `roachprod list` 1355 // when in json mode. 
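 	// An abridged, hand-written illustration of the shape being parsed
 	// (cluster and machine names are made up):
 	//
 	//   {"clusters": {"local": {"vms": [{"machine_type": "n1-standard-4"}]}}}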
1356 type jsonOutput struct { 1357 Clusters map[string]struct { 1358 VMs []struct { 1359 MachineType string `json:"machine_type"` 1360 } `json:"vms"` 1361 } `json:"clusters"` 1362 } 1363 var details jsonOutput 1364 if err := json.Unmarshal(out, &details); err != nil { 1365 return err 1366 } 1367 1368 cDetails, ok := details.Clusters[c.name] 1369 if !ok { 1370 return fmt.Errorf("cluster %q not found", c.name) 1371 } 1372 if len(cDetails.VMs) < c.spec.NodeCount { 1373 return fmt.Errorf("cluster has %d nodes, test requires at least %d", len(cDetails.VMs), c.spec.NodeCount) 1374 } 1375 if cpus := nodes.CPUs; cpus != 0 { 1376 for i, vm := range cDetails.VMs { 1377 vmCPUs := MachineTypeToCPUs(vm.MachineType) 1378 // vmCPUs will be negative if the machine type is unknown. Give unknown 1379 // machine types the benefit of the doubt. 1380 if vmCPUs > 0 && vmCPUs < cpus { 1381 return fmt.Errorf("node %d has %d CPUs, test requires %d", i, vmCPUs, cpus) 1382 } 1383 } 1384 } 1385 return nil 1386 } 1387 1388 // All returns a node list containing all of the nodes in the cluster. 1389 func (c *cluster) All() nodeListOption { 1390 return c.Range(1, c.spec.NodeCount) 1391 } 1392 1393 // All returns a node list containing the nodes [begin,end]. 1394 func (c *cluster) Range(begin, end int) nodeListOption { 1395 if begin < 1 || end > c.spec.NodeCount { 1396 c.t.Fatalf("invalid node range: %d-%d (1-%d)", begin, end, c.spec.NodeCount) 1397 } 1398 r := make(nodeListOption, 0, 1+end-begin) 1399 for i := begin; i <= end; i++ { 1400 r = append(r, i) 1401 } 1402 return r 1403 } 1404 1405 // All returns a node list containing only the node i. 1406 func (c *cluster) Node(i int) nodeListOption { 1407 return c.Range(i, i) 1408 } 1409 1410 // FetchLogs downloads the logs from the cluster using `roachprod get`. 1411 // The logs will be placed in the test's artifacts dir. 1412 func (c *cluster) FetchLogs(ctx context.Context) error { 1413 if c.spec.NodeCount == 0 { 1414 // No nodes can happen during unit tests and implies nothing to do. 1415 return nil 1416 } 1417 1418 c.l.Printf("fetching logs\n") 1419 c.status("fetching logs") 1420 1421 // Don't hang forever if we can't fetch the logs. 1422 return contextutil.RunWithTimeout(ctx, "fetch logs", 2*time.Minute, func(ctx context.Context) error { 1423 path := filepath.Join(c.t.ArtifactsDir(), "logs") 1424 if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil { 1425 return err 1426 } 1427 1428 return execCmd(ctx, c.l, roachprod, "get", c.name, "logs" /* src */, path /* dest */) 1429 }) 1430 } 1431 1432 // CopyRoachprodState copies the roachprod state directory in to the test 1433 // artifacts. 1434 func (c *cluster) CopyRoachprodState(ctx context.Context) error { 1435 if c.spec.NodeCount == 0 { 1436 // No nodes can happen during unit tests and implies nothing to do. 1437 return nil 1438 } 1439 1440 const roachprodStateDirName = ".roachprod" 1441 const roachprodStateName = "roachprod_state" 1442 u, err := user.Current() 1443 if err != nil { 1444 return errors.Wrap(err, "failed to get current user") 1445 } 1446 src := filepath.Join(u.HomeDir, roachprodStateDirName) 1447 dest := filepath.Join(c.t.ArtifactsDir(), roachprodStateName) 1448 cmd := exec.CommandContext(ctx, "cp", "-r", src, dest) 1449 output, err := cmd.CombinedOutput() 1450 return errors.Wrapf(err, "command %q failed: output: %v", cmd.Args, string(output)) 1451 } 1452 1453 // FetchDebugZip downloads the debug zip from the cluster using `roachprod ssh`. 1454 // The logs will be placed in the test's artifacts dir. 
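//
// Illustrative only (the real call sites live in the test runner, not in this
// file): the artifact helpers above and below are typically invoked back to
// back once a test finishes, along the lines of
//
//   _ = c.FetchLogs(ctx)
//   _ = c.FetchDebugZip(ctx)
//   _ = c.CopyRoachprodState(ctx)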
1455 func (c *cluster) FetchDebugZip(ctx context.Context) error { 1456 if c.spec.NodeCount == 0 { 1457 // No nodes can happen during unit tests and implies nothing to do. 1458 return nil 1459 } 1460 1461 c.l.Printf("fetching debug zip\n") 1462 c.status("fetching debug zip") 1463 1464 // Don't hang forever if we can't fetch the debug zip. 1465 return contextutil.RunWithTimeout(ctx, "debug zip", 5*time.Minute, func(ctx context.Context) error { 1466 const zipName = "debug.zip" 1467 path := filepath.Join(c.t.ArtifactsDir(), zipName) 1468 if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil { 1469 return err 1470 } 1471 // Some nodes might be down, so try to find one that works. We make the 1472 // assumption that a down node will refuse the connection, so it won't 1473 // waste our time. 1474 for i := 1; i <= c.spec.NodeCount; i++ { 1475 // `./cockroach debug zip` is noisy. Suppress the output unless it fails. 1476 si := strconv.Itoa(i) 1477 output, err := execCmdWithBuffer(ctx, c.l, roachprod, "ssh", c.name+":"+si, "--", 1478 "./cockroach", "debug", "zip", "--url", "{pgurl:"+si+"}", zipName) 1479 if err != nil { 1480 c.l.Printf("./cockroach debug zip failed: %s", output) 1481 if i < c.spec.NodeCount { 1482 continue 1483 } 1484 return err 1485 } 1486 return execCmd(ctx, c.l, roachprod, "get", c.name+":"+si, zipName /* src */, path /* dest */) 1487 } 1488 return nil 1489 }) 1490 } 1491 1492 // FailOnDeadNodes fails the test if nodes that have a populated data dir are 1493 // found to be not running. It prints both to t.l and the test output. 1494 func (c *cluster) FailOnDeadNodes(ctx context.Context, t *test) { 1495 if c.spec.NodeCount == 0 { 1496 // No nodes can happen during unit tests and implies nothing to do. 1497 return 1498 } 1499 1500 // Don't hang forever. 1501 _ = contextutil.RunWithTimeout(ctx, "detect dead nodes", time.Minute, func(ctx context.Context) error { 1502 output, err := execCmdWithBuffer( 1503 ctx, t.l, roachprod, "monitor", c.name, "--oneshot", "--ignore-empty-nodes", 1504 ) 1505 // If there's an error, it means either that the monitor command failed 1506 // completely, or that it found a dead node worth complaining about. 1507 if err != nil { 1508 if ctx.Err() != nil { 1509 // Don't fail if we timed out. 1510 return nil 1511 } 1512 t.printfAndFail(0 /* skip */, "dead node detection: %s %s", err, output) 1513 } 1514 return nil 1515 }) 1516 } 1517 1518 // CheckReplicaDivergenceOnDB runs a fast consistency check of the whole keyspace 1519 // against the provided db. If an inconsistency is found, it returns it in the 1520 // error. Note that this will swallow errors returned directly from the consistency 1521 // check since we know that such spurious errors are possibly without any relation 1522 // to the check having failed. 1523 func (c *cluster) CheckReplicaDivergenceOnDB(ctx context.Context, db *gosql.DB) error { 1524 // NB: we set a statement_timeout since context cancellation won't work here, 1525 // see: 1526 // https://github.com/cockroachdb/cockroach/pull/34520 1527 // 1528 // We've seen the consistency checks hang indefinitely in some cases. 1529 rows, err := db.QueryContext(ctx, ` 1530 SET statement_timeout = '3m'; 1531 SELECT t.range_id, t.start_key_pretty, t.status, t.detail 1532 FROM 1533 crdb_internal.check_consistency(true, '', '') as t 1534 WHERE t.status NOT IN ('RANGE_CONSISTENT', 'RANGE_INDETERMINATE')`) 1535 if err != nil { 1536 // TODO(tbg): the checks can fail for silly reasons like missing gossiped 1537 // descriptors, etc. 
-- not worth failing the test for. Ideally this would 1538 // be rock solid. 1539 c.l.Printf("consistency check failed with %v; ignoring", err) 1540 return nil 1541 } 1542 var finalErr error 1543 for rows.Next() { 1544 var rangeID int32 1545 var prettyKey, status, detail string 1546 if scanErr := rows.Scan(&rangeID, &prettyKey, &status, &detail); err != nil { 1547 return scanErr 1548 } 1549 finalErr = errors.CombineErrors(finalErr, 1550 errors.Newf("r%d (%s) is inconsistent: %s %s\n", rangeID, prettyKey, status, detail)) 1551 } 1552 if err := rows.Err(); err != nil { 1553 finalErr = errors.CombineErrors(finalErr, err) 1554 } 1555 1556 return finalErr 1557 } 1558 1559 // FailOnReplicaDivergence fails the test if 1560 // crdb_internal.check_consistency(true, '', '') indicates that any ranges' 1561 // replicas are inconsistent with each other. It uses the first node that 1562 // is up to run the query. 1563 func (c *cluster) FailOnReplicaDivergence(ctx context.Context, t *test) { 1564 if c.spec.NodeCount < 1 { 1565 return // unit tests 1566 } 1567 1568 // Find a live node to run against, if one exists. 1569 var db *gosql.DB 1570 for i := 1; i <= c.spec.NodeCount; i++ { 1571 // Don't hang forever. 1572 if err := contextutil.RunWithTimeout( 1573 ctx, "find live node", 5*time.Second, 1574 func(ctx context.Context) error { 1575 db = c.Conn(ctx, i) 1576 _, err := db.ExecContext(ctx, `;`) 1577 return err 1578 }, 1579 ); err != nil { 1580 _ = db.Close() 1581 db = nil 1582 continue 1583 } 1584 c.l.Printf("running (fast) consistency checks on node %d", i) 1585 break 1586 } 1587 if db == nil { 1588 c.l.Printf("no live node found, skipping consistency check") 1589 return 1590 } 1591 defer db.Close() 1592 1593 if err := contextutil.RunWithTimeout( 1594 ctx, "consistency check", time.Minute, 1595 func(ctx context.Context) error { 1596 return c.CheckReplicaDivergenceOnDB(ctx, db) 1597 }, 1598 ); err != nil { 1599 t.Fatal(err) 1600 } 1601 } 1602 1603 // FetchDmesg grabs the dmesg logs if possible. This requires being able to run 1604 // `sudo dmesg` on the remote nodes. 1605 func (c *cluster) FetchDmesg(ctx context.Context) error { 1606 if c.spec.NodeCount == 0 || c.isLocal() { 1607 // No nodes can happen during unit tests and implies nothing to do. 1608 // Also, don't grab dmesg on local runs. 1609 return nil 1610 } 1611 1612 c.l.Printf("fetching dmesg\n") 1613 c.status("fetching dmesg") 1614 1615 // Don't hang forever. 1616 return contextutil.RunWithTimeout(ctx, "dmesg", 20*time.Second, func(ctx context.Context) error { 1617 const name = "dmesg.txt" 1618 path := filepath.Join(c.t.ArtifactsDir(), name) 1619 if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil { 1620 return err 1621 } 1622 if err := execCmd( 1623 ctx, c.l, roachprod, "ssh", c.name, "--", 1624 "/bin/bash", "-c", "'sudo dmesg > "+name+"'", /* src */ 1625 ); err != nil { 1626 // Don't error out because it might've worked on some nodes. Fetching will 1627 // error out below but will get everything it can first. 1628 c.l.Printf("during dmesg fetching: %s", err) 1629 } 1630 return execCmd(ctx, c.l, roachprod, "get", c.name, name /* src */, path /* dest */) 1631 }) 1632 } 1633 1634 // FetchJournalctl grabs the journalctl logs if possible. This requires being 1635 // able to run `sudo journalctl` on the remote nodes. 1636 func (c *cluster) FetchJournalctl(ctx context.Context) error { 1637 if c.spec.NodeCount == 0 || c.isLocal() { 1638 // No nodes can happen during unit tests and implies nothing to do. 
1639 // Also, don't grab journalctl on local runs. 1640 return nil 1641 } 1642 1643 c.l.Printf("fetching journalctl\n") 1644 c.status("fetching journalctl") 1645 1646 // Don't hang forever. 1647 return contextutil.RunWithTimeout(ctx, "journalctl", 20*time.Second, func(ctx context.Context) error { 1648 const name = "journalctl.txt" 1649 path := filepath.Join(c.t.ArtifactsDir(), name) 1650 if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil { 1651 return err 1652 } 1653 if err := execCmd( 1654 ctx, c.l, roachprod, "ssh", c.name, "--", 1655 "/bin/bash", "-c", "'sudo journalctl > "+name+"'", /* src */ 1656 ); err != nil { 1657 // Don't error out because it might've worked on some nodes. Fetching will 1658 // error out below but will get everything it can first. 1659 c.l.Printf("during journalctl fetching: %s", err) 1660 } 1661 return execCmd(ctx, c.l, roachprod, "get", c.name, name /* src */, path /* dest */) 1662 }) 1663 } 1664 1665 // FetchCores fetches any core files on the cluster. 1666 func (c *cluster) FetchCores(ctx context.Context) error { 1667 if c.spec.NodeCount == 0 || c.isLocal() { 1668 // No nodes can happen during unit tests and implies nothing to do. 1669 // Also, don't grab dmesg on local runs. 1670 return nil 1671 } 1672 1673 if true { 1674 // TeamCity does not handle giant artifacts well. We'd generally profit 1675 // from having the cores, but we should push them straight into a temp 1676 // bucket on S3 instead. OTOH, the ROI of this may be low; I don't know 1677 // of a recent example where we've wanted the Core dumps. 1678 c.l.Printf("skipped fetching cores\n") 1679 return nil 1680 } 1681 1682 c.l.Printf("fetching cores\n") 1683 c.status("fetching cores") 1684 1685 // Don't hang forever. The core files can be large, so we give a generous 1686 // timeout. 1687 return contextutil.RunWithTimeout(ctx, "cores", 60*time.Second, func(ctx context.Context) error { 1688 path := filepath.Join(c.t.ArtifactsDir(), "cores") 1689 return execCmd(ctx, c.l, roachprod, "get", c.name, "/mnt/data1/cores" /* src */, path /* dest */) 1690 }) 1691 } 1692 1693 type closeLoggerOpt bool 1694 1695 const ( 1696 closeLogger closeLoggerOpt = true 1697 dontCloseLogger = false 1698 ) 1699 1700 // Destroy calls `roachprod destroy` or `roachprod wipe` on the cluster. 1701 // If called while another Destroy() or destroyInner() is in progress, the call 1702 // blocks until that first call finishes. 1703 // 1704 // If c.Save() had previously been called, then Destroy() will not actually 1705 // touch the cluster. It might still close c.l, though. 1706 // 1707 // Cluster destruction errors are swallowed. 1708 // 1709 // lo specifies if c.l should be closed or not. If c.l may still be in use by a 1710 // test (i.e. if this Destroy is happening because of a timeout or a signal), 1711 // then we don't want to close the logger. 1712 // l is the logger that will log this destroy operation. 1713 // 1714 // This method generally does not react to ctx cancelation. 1715 func (c *cluster) Destroy(ctx context.Context, lo closeLoggerOpt, l *logger) { 1716 if ctx.Err() != nil { 1717 return 1718 } 1719 if c.spec.NodeCount == 0 { 1720 // No nodes can happen during unit tests and implies not much to do. 1721 c.r.unregisterCluster(c) 1722 return 1723 } 1724 1725 ch := c.doDestroy(ctx, l) 1726 <-ch 1727 // NB: Closing the logger without waiting on c.destroyState.destroyed above 1728 // would be bad because we might cause the ongoing `roachprod destroy` to fail 1729 // by closing its stdout/stderr. 
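 	// Hedged illustration of the two callers anticipated by the comment above:
 	//
 	//   c.Destroy(ctx, closeLogger, l)     // normal teardown: also close c.l
 	//   c.Destroy(ctx, dontCloseLogger, l) // timeout/signal path: the test may still log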
1730 if lo == closeLogger && c.l != nil { 1731 c.closeLogger() 1732 } 1733 } 1734 1735 func (c *cluster) doDestroy(ctx context.Context, l *logger) <-chan struct{} { 1736 var inFlight <-chan struct{} 1737 c.destroyState.mu.Lock() 1738 if c.destroyState.mu.saved { 1739 // Nothing to do. Short-circuit. 1740 c.destroyState.mu.Unlock() 1741 ch := make(chan struct{}) 1742 close(ch) 1743 return ch 1744 } 1745 if c.destroyState.mu.destroyed == nil { 1746 c.destroyState.mu.destroyed = make(chan struct{}) 1747 } else { 1748 inFlight = c.destroyState.mu.destroyed 1749 } 1750 c.destroyState.mu.Unlock() 1751 if inFlight != nil { 1752 return inFlight 1753 } 1754 1755 if clusterWipe { 1756 if c.destroyState.owned { 1757 l.PrintfCtx(ctx, "destroying cluster %s...", c) 1758 c.status("destroying cluster") 1759 // We use a non-cancelable context for running this command. Once we got 1760 // here, the cluster cannot be destroyed again, so we really want this 1761 // command to succeed. 1762 if err := execCmd(context.Background(), l, roachprod, "destroy", c.name); err != nil { 1763 l.ErrorfCtx(ctx, "error destroying cluster %s: %s", c, err) 1764 } else { 1765 l.PrintfCtx(ctx, "destroying cluster %s... done", c) 1766 } 1767 c.destroyState.alloc.Release() 1768 } else { 1769 l.PrintfCtx(ctx, "wiping cluster %s", c) 1770 c.status("wiping cluster") 1771 if err := execCmd(ctx, l, roachprod, "wipe", c.name); err != nil { 1772 l.Errorf("%s", err) 1773 } 1774 } 1775 } else { 1776 l.Printf("skipping cluster wipe\n") 1777 } 1778 c.r.unregisterCluster(c) 1779 c.destroyState.mu.Lock() 1780 ch := c.destroyState.mu.destroyed 1781 close(ch) 1782 c.destroyState.mu.Unlock() 1783 return ch 1784 } 1785 1786 // Run a command with output redirected to the logs instead of to os.Stdout 1787 // (which doesn't go anywhere I've been able to find) Don't use this if you're 1788 // going to call cmd.CombinedOutput or cmd.Output. 1789 func (c *cluster) LoggedCommand(ctx context.Context, arg0 string, args ...string) *exec.Cmd { 1790 cmd := exec.CommandContext(ctx, arg0, args...) 1791 cmd.Stdout = c.l.stdout 1792 cmd.Stderr = c.l.stderr 1793 return cmd 1794 } 1795 1796 // Put a local file to all of the machines in a cluster. 1797 // Put is DEPRECATED. Use PutE instead. 1798 func (c *cluster) Put(ctx context.Context, src, dest string, opts ...option) { 1799 if err := c.PutE(ctx, c.l, src, dest, opts...); err != nil { 1800 c.t.Fatal(err) 1801 } 1802 } 1803 1804 // PutE puts a local file to all of the machines in a cluster. 1805 func (c *cluster) PutE(ctx context.Context, l *logger, src, dest string, opts ...option) error { 1806 if ctx.Err() != nil { 1807 return errors.Wrap(ctx.Err(), "cluster.Put") 1808 } 1809 1810 c.status("uploading binary") 1811 defer c.status("") 1812 1813 err := execCmd(ctx, c.l, roachprod, "put", c.makeNodes(opts...), src, dest) 1814 if err != nil { 1815 return errors.Wrap(err, "cluster.Put") 1816 } 1817 return nil 1818 } 1819 1820 // Get gets files from remote hosts. 1821 func (c *cluster) Get(ctx context.Context, l *logger, src, dest string, opts ...option) error { 1822 if ctx.Err() != nil { 1823 return errors.Wrap(ctx.Err(), "cluster.Get error") 1824 } 1825 c.status(fmt.Sprintf("getting %v", src)) 1826 defer c.status("") 1827 return errors.Wrap( 1828 execCmd(ctx, l, roachprod, "get", c.makeNodes(opts...), src, dest), 1829 "cluster.Get error") 1830 1831 } 1832 1833 // Put a string into the specified file on the remote(s). 
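// A hedged usage sketch of the transfer helpers (paths and contents below are
// made up; Put fatals the test on error, while the *E variants and PutString
// return the error):
//
//   c.Put(ctx, cockroach, "./cockroach")
//   _ = c.Get(ctx, c.l, "logs" /* src */, "artifacts/logs" /* dest */)
//   _ = c.PutString(ctx, "#!/usr/bin/env bash\n...", "./run.sh", 0755)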

// Put a string into the specified file on the remote(s).
func (c *cluster) PutString(
	ctx context.Context, content, dest string, mode os.FileMode, opts ...option,
) error {
	if ctx.Err() != nil {
		return errors.Wrap(ctx.Err(), "cluster.PutString error")
	}
	c.status("uploading string")
	defer c.status("")

	temp, err := ioutil.TempFile("", filepath.Base(dest))
	if err != nil {
		return errors.Wrap(err, "PutString")
	}
	if _, err := temp.WriteString(content); err != nil {
		return errors.Wrap(err, "PutString")
	}
	temp.Close()
	src := temp.Name()

	if err := os.Chmod(src, mode); err != nil {
		return errors.Wrap(err, "PutString")
	}
	// NB: we intentionally don't remove the temp files. This is because roachprod
	// will symlink them when running locally.

	if err := execCmd(ctx, c.l, roachprod, "put", c.makeNodes(opts...), src, dest); err != nil {
		return errors.Wrap(err, "PutString")
	}
	return nil
}

// GitClone clones a git repo from src into dest and checks out origin's
// version of the given branch. The src, dest, and branch arguments must not
// contain shell special characters.
func (c *cluster) GitClone(
	ctx context.Context, l *logger, src, dest, branch string, node nodeListOption,
) error {
	return c.RunL(ctx, l, node, "bash", "-e", "-c", fmt.Sprintf(`'
if ! test -d %s; then
  git clone -b %s --depth 1 %s %s
else
  cd %s
  git fetch origin
  git checkout origin/%s
fi
'`, dest,
		branch, src, dest,
		dest,
		branch))
}

// startArgs specifies extra arguments that are passed to `roachprod` during `c.Start`.
func startArgs(extraArgs ...string) option {
	return roachprodArgOption(extraArgs)
}

// startArgsDontEncrypt will pass '--encrypt=false' to roachprod regardless of the
// --encrypt flag on roachtest. This is useful for tests that cannot pass with
// encryption enabled.
var startArgsDontEncrypt = startArgs("--encrypt=false")

// racks is an option which specifies the number of racks to partition the nodes
// into.
func racks(n int) option {
	return startArgs(fmt.Sprintf("--racks=%d", n))
}

// stopArgs specifies extra arguments that are passed to `roachprod` during `c.Stop`.
func stopArgs(extraArgs ...string) option {
	return roachprodArgOption(extraArgs)
}

type roachprodArgOption []string

func (o roachprodArgOption) option() {}

func roachprodArgs(opts []option) []string {
	var args []string
	for _, opt := range opts {
		a, ok := opt.(roachprodArgOption)
		if !ok {
			continue
		}
		args = append(args, ([]string)(a)...)
	}
	return args
}
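
// Hedged example of composing the option helpers above (illustrative only; the
// specific stop flag is an assumption about roachprod's CLI, not verified
// here): start nodes 1-3 partitioned into three racks with encryption forced
// off, then stop them with an explicit signal.
func exampleStartStopOptions(ctx context.Context, t *test, c *cluster) {
	nodes := c.Range(1, 3)
	c.Start(ctx, t, nodes, racks(3), startArgsDontEncrypt)
	c.Stop(ctx, nodes, stopArgs("--sig=9"))
}

// Silence unused warning for the example above.
var _ = exampleStartStopOptions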

// Restart restarts the specified cockroach node. It takes a test and, on error,
// calls t.Fatal().
func (c *cluster) Restart(ctx context.Context, t *test, node nodeListOption) {
	// We bound the time taken to restart a node through roachprod. Because
	// roachprod uses SSH, it's particularly vulnerable to network flakiness (as
	// seen in #35326) and may stall indefinitely. Setting up timeouts better
	// surfaces this kind of failure.
	//
	// TODO(irfansharif): The underlying issue here is the fact that we're running
	// roachprod commands that may (reasonably) fail due to connection issues, and
	// we're unable to retry them safely (the underlying commands are
	// non-idempotent). Presently we simply fail the entire test, when really we
	// should be able to retry the specific roachprod commands.
	var cancel func()
	ctx, cancel = context.WithTimeout(ctx, 30*time.Second)
	c.Stop(ctx, node)
	c.Start(ctx, t, node)
	cancel()
}

// StartE starts cockroach nodes on a subset of the cluster. The nodes parameter
// can either be a specific node, empty (to indicate all nodes), or a pair of
// nodes indicating a range.
func (c *cluster) StartE(ctx context.Context, opts ...option) error {
	// If the test failed (indicated by a canceled ctx), short-circuit.
	if ctx.Err() != nil {
		return errors.Wrap(ctx.Err(), "cluster.StartE")
	}
	c.status("starting cluster")
	defer c.status()
	args := []string{
		roachprod,
		"start",
	}
	args = append(args, roachprodArgs(opts)...)
	args = append(args, c.makeNodes(opts...))
	if !argExists(args, "--encrypt") && c.encryptDefault {
		args = append(args, "--encrypt")
	}
	return execCmd(ctx, c.l, args...)
}

// Start is like StartE() except it takes a test and, on error, calls t.Fatal().
func (c *cluster) Start(ctx context.Context, t *test, opts ...option) {
	FatalIfErr(t, c.StartE(ctx, opts...))
}

func argExists(args []string, target string) bool {
	for _, arg := range args {
		if arg == target || strings.HasPrefix(arg, target+"=") {
			return true
		}
	}
	return false
}

// StopE stops cockroach nodes running on a subset of the cluster. See
// cluster.Start() for a description of the nodes parameter.
func (c *cluster) StopE(ctx context.Context, opts ...option) error {
	if ctx.Err() != nil {
		return errors.Wrap(ctx.Err(), "cluster.StopE")
	}
	args := []string{
		roachprod,
		"stop",
	}
	args = append(args, roachprodArgs(opts)...)
	args = append(args, c.makeNodes(opts...))
	c.status("stopping cluster")
	defer c.status()
	return execCmd(ctx, c.l, args...)
}

// Stop is like StopE, except instead of returning an error, it does
// c.t.Fatal(). c.t needs to be set.
func (c *cluster) Stop(ctx context.Context, opts ...option) {
	if c.t.Failed() {
		// If the test has failed, don't try to limp along.
		return
	}
	if err := c.StopE(ctx, opts...); err != nil {
		c.t.Fatal(err)
	}
}

// WipeE wipes a subset of the nodes in a cluster. See cluster.Start() for a
// description of the nodes parameter.
func (c *cluster) WipeE(ctx context.Context, l *logger, opts ...option) error {
	if ctx.Err() != nil {
		return errors.Wrap(ctx.Err(), "cluster.WipeE")
	}
	if c.spec.NodeCount == 0 {
		// For tests.
		return nil
	}
	c.status("wiping cluster")
	defer c.status()
	return execCmd(ctx, l, roachprod, "wipe", c.makeNodes(opts...))
}

// Wipe is like WipeE, except instead of returning an error, it does
// c.t.Fatal(). c.t needs to be set.
func (c *cluster) Wipe(ctx context.Context, opts ...option) {
	if ctx.Err() != nil {
		return
	}
	if err := c.WipeE(ctx, c.l, opts...); err != nil {
		c.t.Fatal(err)
	}
}
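
// Hedged illustration of the --encrypt handling in StartE above (not used by
// any test): argExists matches both the exact flag and the "--flag=value" form,
// so a caller-provided "--encrypt=false" prevents StartE from appending the
// cluster's default --encrypt flag a second time.
func exampleEncryptArgAlreadySet() bool {
	args := []string{roachprod, "start", "--encrypt=false", "local:1-3"}
	return argExists(args, "--encrypt") // true, via the prefix check
}

// Silence unused warning for the example above.
var _ = exampleEncryptArgAlreadySet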

// Run a command on the specified node.
func (c *cluster) Run(ctx context.Context, node nodeListOption, args ...string) {
	err := c.RunE(ctx, node, args...)
	if err != nil {
		c.t.Fatal(err)
	}
}

// Reformat the disk on the specified node.
func (c *cluster) Reformat(ctx context.Context, node nodeListOption, args ...string) {
	err := execCmd(ctx, c.l,
		append([]string{roachprod, "reformat", c.makeNodes(node), "--"}, args...)...)
	if err != nil {
		c.t.Fatal(err)
	}
}

// Silence unused warning.
var _ = (&cluster{}).Reformat

// Install installs a package on a node.
func (c *cluster) Install(
	ctx context.Context, l *logger, node nodeListOption, args ...string,
) error {
	return execCmd(ctx, l,
		append([]string{roachprod, "install", c.makeNodes(node), "--"}, args...)...)
}

var reOnlyAlphanumeric = regexp.MustCompile(`[^a-zA-Z0-9]+`)

// cmdLogFileName comes up with a log file to use for the given argument string.
func cmdLogFileName(t time.Time, nodes nodeListOption, args ...string) string {
	// Make sure we treat {"./cockroach start"} like {"./cockroach", "start"}.
	args = strings.Split(strings.Join(args, " "), " ")
	prefix := []string{reOnlyAlphanumeric.ReplaceAllString(args[0], "")}
	for _, arg := range args[1:] {
		if s := reOnlyAlphanumeric.ReplaceAllString(arg, ""); s != arg {
			break
		}
		prefix = append(prefix, arg)
	}
	s := strings.Join(prefix, "_")
	const maxLen = 70
	if len(s) > maxLen {
		s = s[:maxLen]
	}
	logFile := fmt.Sprintf(
		"run_%s_n%s_%s",
		t.Format(`150405.000`),
		nodes.String()[1:],
		s,
	)
	return logFile
}

// RunE runs a command on the specified node, returning an error. The output
// will be redirected to a file which is logged via the cluster-wide logger in
// case of an error. Logs will sort chronologically and those belonging to
// failing invocations will be suffixed `.failed.log`.
func (c *cluster) RunE(ctx context.Context, node nodeListOption, args ...string) error {
	cmdString := strings.Join(args, " ")
	logFile := cmdLogFileName(timeutil.Now(), node, args...)

	// NB: we set no prefix because it's only going to a file anyway.
	l, err := c.l.ChildLogger(logFile, quietStderr, quietStdout)
	if err != nil {
		return err
	}
	c.l.PrintfCtx(ctx, "> %s", cmdString)
	err = c.RunL(ctx, l, node, args...)
	l.Printf("> result: %+v", err)
	if err := ctx.Err(); err != nil {
		l.Printf("(note: incoming context was canceled: %s)", err)
	}
	physicalFileName := l.file.Name()
	l.close()
	if err != nil {
		_ = os.Rename(physicalFileName, strings.TrimSuffix(physicalFileName, ".log")+".failed.log")
	}
	err = errors.Wrapf(err, "output in %s", logFile)
	return err
}

// RunL runs a command on the specified node, returning an error.
func (c *cluster) RunL(ctx context.Context, l *logger, node nodeListOption, args ...string) error {
	if err := errors.Wrap(ctx.Err(), "cluster.RunL"); err != nil {
		return err
	}
	return execCmd(ctx, l,
		append([]string{roachprod, "run", c.makeNodes(node), "--"}, args...)...)
}
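
// Hedged illustration of cmdLogFileName (not exercised anywhere): for
// `./cockroach version` on nodes 1-3, the generated name should look roughly
// like "run_<HHMMSS.mmm>_n1-3_cockroach_version", assuming nodeListOption
// renders as ":1-3" and using the timestamp layout above.
func exampleCmdLogFileName(c *cluster) string {
	return cmdLogFileName(timeutil.Now(), c.Range(1, 3), "./cockroach", "version")
}

// Silence unused warning for the example above.
var _ = exampleCmdLogFileName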

// RunWithBuffer runs a command on the specified node, returning the resulting
// combined stderr and stdout or an error.
func (c *cluster) RunWithBuffer(
	ctx context.Context, l *logger, node nodeListOption, args ...string,
) ([]byte, error) {
	if err := errors.Wrap(ctx.Err(), "cluster.RunWithBuffer"); err != nil {
		return nil, err
	}
	return execCmdWithBuffer(ctx, l,
		append([]string{roachprod, "run", c.makeNodes(node), "--"}, args...)...)
}

// pgURL returns the Postgres endpoint for the specified node. It accepts a flag
// specifying whether the URL should include the node's internal or external IP
// address. In general, intra-cluster communication should use internal IPs, and
// communication from a test driver to nodes in a cluster should use external IPs.
func (c *cluster) pgURL(ctx context.Context, node nodeListOption, external bool) []string {
	args := []string{roachprod, "pgurl"}
	if external {
		args = append(args, `--external`)
	}
	nodes := c.makeNodes(node)
	args = append(args, nodes)
	cmd := execCmdEx(ctx, c.l, args...)
	if cmd.err != nil {
		c.t.Fatal(errors.Wrapf(cmd.err, "failed to get pgurl for nodes: %s", nodes))
	}
	urls := strings.Split(strings.TrimSpace(cmd.stdout), " ")
	if len(urls) != len(node) {
		c.t.Fatalf(
			"pgurl for nodes %v got urls %v from stdout:\n%s\nstderr:\n%s",
			node, urls, cmd.stdout, cmd.stderr,
		)
	}
	for i := range urls {
		urls[i] = strings.Trim(urls[i], "'")
		if urls[i] == "" {
			c.t.Fatalf(
				"pgurl for nodes %v empty: got %v from\nstdout:\n%s\nstderr:\n%s",
				node, urls, cmd.stdout, cmd.stderr,
			)
		}
	}
	return urls
}

// InternalPGUrl returns the internal Postgres endpoint for the specified nodes.
func (c *cluster) InternalPGUrl(ctx context.Context, node nodeListOption) []string {
	return c.pgURL(ctx, node, false /* external */)
}

// Silence unused warning.
var _ = (&cluster{}).InternalPGUrl

// ExternalPGUrl returns the external Postgres endpoint for the specified nodes.
func (c *cluster) ExternalPGUrl(ctx context.Context, node nodeListOption) []string {
	return c.pgURL(ctx, node, true /* external */)
}

func addrToAdminUIAddr(c *cluster, addr string) string {
	host, port, err := net.SplitHostPort(addr)
	if err != nil {
		c.t.Fatal(err)
	}
	webPort, err := strconv.Atoi(port)
	if err != nil {
		c.t.Fatal(err)
	}
	// Roachprod assigns the Admin UI the node's port + 1.
	return fmt.Sprintf("%s:%d", host, webPort+1)
}

func urlToAddr(c *cluster, pgURL string) string {
	u, err := url.Parse(pgURL)
	if err != nil {
		c.t.Fatal(err)
	}
	return u.Host
}

func addrToHost(c *cluster, addr string) string {
	host, _ := addrToHostPort(c, addr)
	return host
}

func addrToHostPort(c *cluster, addr string) (string, int) {
	host, portStr, err := net.SplitHostPort(addr)
	if err != nil {
		c.t.Fatal(err)
	}
	port, err := strconv.Atoi(portStr)
	if err != nil {
		c.t.Fatal(err)
	}
	return host, port
}
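
// Hedged sketch (illustrative only, not called by any test): split node 1's
// external SQL address into host and port using the helpers above. Assumes the
// cluster is running so that pgURL can resolve the address.
func exampleExternalHostPort(ctx context.Context, c *cluster) (string, int) {
	addr := urlToAddr(c, c.ExternalPGUrl(ctx, c.Node(1))[0])
	return addrToHostPort(c, addr)
}

// Silence unused warning for the example above.
var _ = exampleExternalHostPort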

// InternalAdminUIAddr returns the internal Admin UI address in the form host:port
// for the specified nodes.
func (c *cluster) InternalAdminUIAddr(ctx context.Context, node nodeListOption) []string {
	var addrs []string
	for _, u := range c.InternalAddr(ctx, node) {
		addrs = append(addrs, addrToAdminUIAddr(c, u))
	}
	return addrs
}

// ExternalAdminUIAddr returns the external Admin UI address in the form host:port
// for the specified nodes.
func (c *cluster) ExternalAdminUIAddr(ctx context.Context, node nodeListOption) []string {
	var addrs []string
	for _, u := range c.ExternalAddr(ctx, node) {
		addrs = append(addrs, addrToAdminUIAddr(c, u))
	}
	return addrs
}

// InternalAddr returns the internal address in the form host:port for the
// specified nodes.
func (c *cluster) InternalAddr(ctx context.Context, node nodeListOption) []string {
	var addrs []string
	for _, u := range c.pgURL(ctx, node, false /* external */) {
		addrs = append(addrs, urlToAddr(c, u))
	}
	return addrs
}

// InternalIP returns the internal IP addresses for the specified nodes.
func (c *cluster) InternalIP(ctx context.Context, node nodeListOption) []string {
	var ips []string
	for _, addr := range c.InternalAddr(ctx, node) {
		ips = append(ips, addrToHost(c, addr))
	}
	return ips
}

// ExternalAddr returns the external address in the form host:port for the
// specified nodes.
func (c *cluster) ExternalAddr(ctx context.Context, node nodeListOption) []string {
	var addrs []string
	for _, u := range c.pgURL(ctx, node, true /* external */) {
		addrs = append(addrs, urlToAddr(c, u))
	}
	return addrs
}

// ExternalIP returns the external IP addresses for the specified nodes.
func (c *cluster) ExternalIP(ctx context.Context, node nodeListOption) []string {
	var ips []string
	for _, addr := range c.ExternalAddr(ctx, node) {
		ips = append(ips, addrToHost(c, addr))
	}
	return ips
}

// Silence unused warning.
var _ = (&cluster{}).ExternalIP

// Conn returns a SQL connection to the specified node. On error, it calls
// c.t.Fatal().
func (c *cluster) Conn(ctx context.Context, node int) *gosql.DB {
	url := c.ExternalPGUrl(ctx, c.Node(node))[0]
	db, err := gosql.Open("postgres", url)
	if err != nil {
		c.t.Fatal(err)
	}
	return db
}

// ConnE is like Conn, except it returns an error instead of calling c.t.Fatal().
func (c *cluster) ConnE(ctx context.Context, node int) (*gosql.DB, error) {
	url := c.ExternalPGUrl(ctx, c.Node(node))[0]
	db, err := gosql.Open("postgres", url)
	if err != nil {
		return nil, err
	}
	return db, nil
}

func (c *cluster) makeNodes(opts ...option) string {
	var r nodeListOption
	for _, o := range opts {
		if s, ok := o.(nodeSelector); ok {
			r = s.merge(r)
		}
	}
	return c.name + r.String()
}

func (c *cluster) isLocal() bool {
	return c.name == "local"
}
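
// Hedged example (not used by any test): open a SQL connection to node 1 via
// ConnE and run a trivial query, letting the caller handle errors instead of
// fataling the test.
func exampleQueryNode1(ctx context.Context, c *cluster) (int, error) {
	db, err := c.ConnE(ctx, 1)
	if err != nil {
		return 0, err
	}
	defer db.Close()
	var n int
	err = db.QueryRowContext(ctx, "SELECT 1").Scan(&n)
	return n, err
}

// Silence unused warning for the example above.
var _ = exampleQueryNode1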

// Extend extends the cluster's expiration by d, after truncating d to minute
// granularity.
func (c *cluster) Extend(ctx context.Context, d time.Duration, l *logger) error {
	if ctx.Err() != nil {
		return errors.Wrap(ctx.Err(), "cluster.Extend")
	}
	minutes := int(d.Minutes())
	l.PrintfCtx(ctx, "extending cluster by %d minutes", minutes)
	if out, err := execCmdWithBuffer(ctx, l, roachprod, "extend", c.name,
		fmt.Sprintf("--lifetime=%dm", minutes),
	); err != nil {
		l.PrintfCtx(ctx, "roachprod extend failed: %s", out)
		return errors.Wrap(err, "roachprod extend failed")
	}
	// Update c.expiration. Keep it under the real expiration.
	c.expiration = c.expiration.Add(time.Duration(minutes-1) * time.Minute)
	return nil
}

// getDiskUsageInBytes does what's on the tin. nodeIdx starts at one.
func getDiskUsageInBytes(
	ctx context.Context, c *cluster, logger *logger, nodeIdx int,
) (int, error) {
	var out []byte
	for {
		if c.t.Failed() {
			return 0, errors.New("already failed")
		}
		var err error
		// `du` can warn if files get removed out from under it (which
		// happens during RocksDB compactions, for example). Discard its
		// stderr to avoid breaking Atoi later.
		// TODO(bdarnell): Refactor this stack to not combine stdout and
		// stderr so we don't need to do this (and the Warning check
		// below).
		out, err = c.RunWithBuffer(ctx, logger, c.Node(nodeIdx),
			"du -sk {store-dir} 2>/dev/null | grep -oE '^[0-9]+'")
		if err != nil {
			if ctx.Err() != nil {
				return 0, ctx.Err()
			}
			// If `du` fails, retry.
			// TODO(bdarnell): is this worth doing? It was originally added
			// because of the "files removed out from under it" problem, but
			// that doesn't result in a command failure, just a stderr
			// message.
			logger.Printf("retrying disk usage computation after spurious error: %s", err)
			continue
		}
		break
	}

	str := string(out)
	// We need this check because sometimes the first line of the roachprod
	// output is a warning about adding an ip to a list of known hosts.
	if strings.Contains(str, "Warning") {
		str = strings.Split(str, "\n")[1]
	}

	size, err := strconv.Atoi(strings.TrimSpace(str))
	if err != nil {
		return 0, err
	}

	return size * 1024, nil
}

type monitor struct {
	t         testI
	l         *logger
	nodes     string
	ctx       context.Context
	cancel    func()
	g         *errgroup.Group
	expDeaths int32 // atomically
}

func newMonitor(ctx context.Context, c *cluster, opts ...option) *monitor {
	m := &monitor{
		t:     c.t,
		l:     c.l,
		nodes: c.makeNodes(opts...),
	}
	m.ctx, m.cancel = context.WithCancel(ctx)
	m.g, m.ctx = errgroup.WithContext(m.ctx)
	return m
}

// ExpectDeath lets the monitor know that a node is about to be killed, and that
// this should be ignored.
func (m *monitor) ExpectDeath() {
	m.ExpectDeaths(1)
}
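
// Hedged sketch of the typical monitor pattern (illustrative only; the workload
// invocation is hypothetical and the example assumes a multi-node cluster):
// run a task under the monitor, announce an intentional node death before
// stopping the node, and wait for both the task and the monitor to settle.
func exampleMonitorUsage(ctx context.Context, t *test, c *cluster) {
	m := newMonitor(ctx, c)
	m.Go(func(ctx context.Context) error {
		// Run load from the last node while node 1 is restarted.
		return c.RunE(ctx, c.Node(c.spec.NodeCount), "./workload", "run", "kv", "--duration=1m")
	})
	m.ExpectDeath()
	c.Stop(ctx, c.Node(1))
	c.Start(ctx, t, c.Node(1))
	m.Wait()
}

// Silence unused warning for the example above.
var _ = exampleMonitorUsage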

// ExpectDeaths lets the monitor know that a specific number of nodes are about
// to be killed, and that they should be ignored.
func (m *monitor) ExpectDeaths(count int32) {
	atomic.AddInt32(&m.expDeaths, count)
}

func (m *monitor) ResetDeaths() {
	atomic.StoreInt32(&m.expDeaths, 0)
}

var errGoexit = errors.New("Goexit() was called")

func (m *monitor) Go(fn func(context.Context) error) {
	m.g.Go(func() (err error) {
		var returned bool
		defer func() {
			if returned {
				return
			}
			if r := recover(); r != errGoexit && r != nil {
				// Pass any regular panics through.
				panic(r)
			} else {
				// If the invoked method called runtime.Goexit (as happens when it
				// calls t.Fatal), exit with a sentinel error here so that the
				// wrapped errgroup cancels itself.
				//
				// Note that the trick here is that we panicked explicitly below,
				// which somehow "overrides" the Goexit which is supposed to be
				// un-recoverable, but we do need to recover to return an error.
				err = errGoexit
			}
		}()
		if impl, ok := m.t.(*test); ok {
			// Automatically clear the worker status message when the goroutine exits.
			defer impl.WorkerStatus()
		}
		defer func() {
			if !returned {
				if r := recover(); r != nil {
					panic(r)
				}
				panic(errGoexit)
			}
		}()
		err = fn(m.ctx)
		returned = true
		return err
	})
}

func (m *monitor) WaitE() error {
	if m.t.Failed() {
		// If the test has failed, don't try to limp along.
		return errors.New("already failed")
	}

	return errors.Wrap(m.wait(roachprod, "monitor", m.nodes), "monitor failure")
}

func (m *monitor) Wait() {
	if m.t.Failed() {
		// If the test has failed, don't try to limp along.
		return
	}
	if err := m.WaitE(); err != nil {
		// Note that we used to avoid fataling again if we had already fatal'ed.
		// However, this error here might be the one to actually report, see:
		// https://github.com/cockroachdb/cockroach/issues/44436
		m.t.Fatal(err)
	}
}

func (m *monitor) wait(args ...string) error {
	// It is surprisingly difficult to get the cancellation semantics exactly
	// right. We need to watch for the "workers" group (m.g) to finish, or for
	// the monitor command to emit an unexpected node failure, or for the monitor
	// command itself to exit. We want to capture whichever error happens first
	// and then cancel the other goroutines. This ordering prevents the usage of
	// an errgroup.Group for the goroutines below. Consider:
	//
	//   g, _ := errgroup.WithContext(m.ctx)
	//   g.Go(func(context.Context) error {
	//     defer m.cancel()
	//     return m.g.Wait()
	//   })
	//
	// Now consider what happens when an error is returned. Before the error
	// reaches the errgroup, we invoke the cancellation closure which can cause
	// the other goroutines to wake up and perhaps race and set the errgroup
	// error first.
	//
	// The solution is to implement our own errgroup mechanism here which allows
	// us to set the error before performing the cancellation.

	var errOnce sync.Once
	var err error
	setErr := func(e error) {
		if e != nil {
			errOnce.Do(func() {
				err = e
			})
		}
	}

	// 1. The first goroutine waits for the worker errgroup to exit.
	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		defer func() {
			m.cancel()
			wg.Done()
		}()
		setErr(errors.Wrap(m.g.Wait(), "monitor task failed"))
	}()

	setMonitorCmdErr := func(err error) {
		setErr(errors.Wrap(err, "monitor command failure"))
	}

	// 2. The second goroutine forks/execs the monitoring command.
	pipeR, pipeW := io.Pipe()
	wg.Add(1)
	go func() {
		defer func() {
			_ = pipeW.Close()
			wg.Done()
			// NB: we explicitly do not want to call m.cancel() here as we want the
			// goroutine that is reading the monitoring events to be able to decide
			// on the error if the monitoring command exits peacefully.
		}()

		monL, err := m.l.ChildLogger(`MONITOR`)
		if err != nil {
			setMonitorCmdErr(err)
			return
		}
		defer monL.close()

		cmd := exec.CommandContext(m.ctx, args[0], args[1:]...)
		cmd.Stdout = io.MultiWriter(pipeW, monL.stdout)
		cmd.Stderr = monL.stderr
		if err := cmd.Run(); err != nil {
			if !errors.Is(err, context.Canceled) && !strings.Contains(err.Error(), "killed") {
				// The expected reason for an error is that the monitor was killed due
				// to the context being canceled. Any other error is an actual error.
				setMonitorCmdErr(err)
				return
			}
		}
		// Returning will cause the pipe to be closed which will cause the reader
		// goroutine to exit and close the monitoring channel.
	}()

	// 3. The third goroutine reads from the monitoring pipe, watching for any
	// unexpected death events.
	wg.Add(1)
	go func() {
		defer func() {
			_ = pipeR.Close()
			m.cancel()
			wg.Done()
		}()

		scanner := bufio.NewScanner(pipeR)
		for scanner.Scan() {
			msg := scanner.Text()
			var id int
			var s string
			if n, _ := fmt.Sscanf(msg, "%d: %s", &id, &s); n == 2 {
				if strings.Contains(s, "dead") && atomic.AddInt32(&m.expDeaths, -1) < 0 {
					setErr(fmt.Errorf("unexpected node event: %s", msg))
					return
				}
			}
		}
	}()

	wg.Wait()
	return err
}

func waitForFullReplication(t *test, db *gosql.DB) {
	t.l.Printf("waiting for up-replication...\n")
	tStart := timeutil.Now()
	for ok := false; !ok; time.Sleep(time.Second) {
		if err := db.QueryRow(
			"SELECT min(array_length(replicas, 1)) >= 3 FROM crdb_internal.ranges",
		).Scan(&ok); err != nil {
			t.Fatal(err)
		}
		if timeutil.Since(tStart) > 30*time.Second {
			t.l.Printf("still waiting for full replication")
		}
	}
}

type loadGroup struct {
	roachNodes nodeListOption
	loadNodes  nodeListOption
}

type loadGroupList []loadGroup

func (lg loadGroupList) roachNodes() nodeListOption {
	var roachNodes nodeListOption
	for _, g := range lg {
		roachNodes = roachNodes.merge(g.roachNodes)
	}
	return roachNodes
}

func (lg loadGroupList) loadNodes() nodeListOption {
	var loadNodes nodeListOption
	for _, g := range lg {
		loadNodes = loadNodes.merge(g.loadNodes)
	}
	return loadNodes
}

// makeLoadGroups creates a loadGroupList that has an equal number of cockroach
// nodes per zone. It assumes that numLoadNodes <= numZones and that numZones is
// divisible by numLoadNodes.
func makeLoadGroups(c *cluster, numZones, numRoachNodes, numLoadNodes int) loadGroupList {
	if numLoadNodes > numZones {
		panic("cannot have more than one load node per zone")
	} else if numZones%numLoadNodes != 0 {
		panic("numZones must be divisible by numLoadNodes")
	}
	// roachprod allocates nodes over regions in a round-robin fashion.
	// If the number of nodes is not divisible by the number of regions, the
	// extra nodes are allocated in a round-robin fashion over the regions at
	// the end of the cluster.
	loadNodesAtTheEnd := numLoadNodes%numZones != 0
	loadGroups := make(loadGroupList, numLoadNodes)
	roachNodesPerGroup := numRoachNodes / numLoadNodes
	for i := range loadGroups {
		if loadNodesAtTheEnd {
			first := i*roachNodesPerGroup + 1
			loadGroups[i].roachNodes = c.Range(first, first+roachNodesPerGroup-1)
			loadGroups[i].loadNodes = c.Node(numRoachNodes + i + 1)
		} else {
			first := i*(roachNodesPerGroup+1) + 1
			loadGroups[i].roachNodes = c.Range(first, first+roachNodesPerGroup-1)
			loadGroups[i].loadNodes = c.Node((i + 1) * (roachNodesPerGroup + 1))
		}
	}
	return loadGroups
}
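
// Hedged worked example for makeLoadGroups above (illustrative only): with 3
// zones, 9 cockroach nodes, and 3 load nodes on a 12-node cluster, the helper
// takes the non-loadNodesAtTheEnd branch, i.e. it assumes one load node is
// interleaved after each zone's cockroach nodes, yielding groups
// {1-3, load 4}, {5-7, load 8}, and {9-11, load 12}.
func exampleLoadGroups(c *cluster) loadGroupList {
	return makeLoadGroups(c, 3 /* numZones */, 9 /* numRoachNodes */, 3 /* numLoadNodes */)
}

// Silence unused warning for the example above.
var _ = exampleLoadGroups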