github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachprod/main.go (about) 1 // Copyright 2018 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package main 12 13 import ( 14 "encoding/json" 15 "fmt" 16 "log" 17 "net" 18 "os" 19 "os/exec" 20 "os/user" 21 "path" 22 "path/filepath" 23 "regexp" 24 "runtime" 25 "sort" 26 "strings" 27 "text/tabwriter" 28 "time" 29 30 cld "github.com/cockroachdb/cockroach/pkg/cmd/roachprod/cloud" 31 "github.com/cockroachdb/cockroach/pkg/cmd/roachprod/config" 32 rperrors "github.com/cockroachdb/cockroach/pkg/cmd/roachprod/errors" 33 "github.com/cockroachdb/cockroach/pkg/cmd/roachprod/install" 34 "github.com/cockroachdb/cockroach/pkg/cmd/roachprod/ssh" 35 "github.com/cockroachdb/cockroach/pkg/cmd/roachprod/ui" 36 "github.com/cockroachdb/cockroach/pkg/cmd/roachprod/vm" 37 "github.com/cockroachdb/cockroach/pkg/cmd/roachprod/vm/aws" 38 _ "github.com/cockroachdb/cockroach/pkg/cmd/roachprod/vm/azure" 39 "github.com/cockroachdb/cockroach/pkg/cmd/roachprod/vm/gce" 40 "github.com/cockroachdb/cockroach/pkg/cmd/roachprod/vm/local" 41 "github.com/cockroachdb/cockroach/pkg/util/flagutil" 42 "github.com/cockroachdb/errors" 43 "github.com/spf13/cobra" 44 "golang.org/x/crypto/ssh/terminal" 45 "golang.org/x/sys/unix" 46 ) 47 48 var rootCmd = &cobra.Command{ 49 Use: "roachprod [command] (flags)", 50 Short: "roachprod tool for manipulating test clusters", 51 Long: `roachprod is a tool for manipulating ephemeral test clusters, allowing easy 52 creating, destruction, starting, stopping and wiping of clusters along with 53 running load generators. 54 55 Examples: 56 57 roachprod create local -n 3 58 roachprod start local 59 roachprod sql local:2 -- -e "select * from crdb_internal.node_runtime_info" 60 roachprod stop local 61 roachprod wipe local 62 roachprod destroy local 63 64 The above commands will create a "local" 3 node cluster, start a cockroach 65 cluster on these nodes, run a sql command on the 2nd node, stop, wipe and 66 destroy the cluster. 67 `, 68 } 69 70 var ( 71 numNodes int 72 numRacks int 73 username string 74 dryrun bool 75 destroyAllMine bool 76 extendLifetime time.Duration 77 wipePreserveCerts bool 78 listDetails bool 79 listJSON bool 80 listMine bool 81 clusterType = "cockroach" 82 secure = false 83 nodeEnv = "COCKROACH_ENABLE_RPC_COMPRESSION=false" 84 nodeArgs []string 85 tag string 86 external = false 87 adminurlOpen = false 88 adminurlPath = "" 89 adminurlIPs = false 90 useTreeDist = true 91 encrypt = false 92 quiet = false 93 sig = 9 94 waitFlag = false 95 stageOS string 96 logsDir string 97 logsFilter string 98 logsProgramFilter string 99 logsFrom time.Time 100 logsTo time.Time 101 logsInterval time.Duration 102 maxConcurrency int 103 104 monitorIgnoreEmptyNodes bool 105 monitorOneShot bool 106 107 cachedHostsCluster string 108 ) 109 110 func sortedClusters() []string { 111 var r []string 112 for n := range install.Clusters { 113 r = append(r, n) 114 } 115 sort.Strings(r) 116 return r 117 } 118 119 func newCluster(name string) (*install.SyncedCluster, error) { 120 nodeNames := "all" 121 { 122 parts := strings.Split(name, ":") 123 switch len(parts) { 124 case 2: 125 nodeNames = parts[1] 126 fallthrough 127 case 1: 128 name = parts[0] 129 case 0: 130 return nil, fmt.Errorf("no cluster specified") 131 default: 132 return nil, fmt.Errorf("invalid cluster name: %s", name) 133 } 134 } 135 136 c, ok := install.Clusters[name] 137 if !ok { 138 err := errors.Newf(`unknown cluster: %s`, name) 139 err = errors.WithHintf(err, ` 140 Available clusters: 141 %s 142 `, strings.Join(sortedClusters(), "\n ")) 143 err = errors.WithHint(err, `Use "roachprod sync" to update the list of available clusters.`) 144 return nil, err 145 } 146 147 switch clusterType { 148 case "cockroach": 149 c.Impl = install.Cockroach{} 150 if numRacks > 0 { 151 for i := range c.Localities { 152 rack := fmt.Sprintf("rack=%d", i%numRacks) 153 if c.Localities[i] != "" { 154 rack = "," + rack 155 } 156 c.Localities[i] += rack 157 } 158 } 159 case "cassandra": 160 c.Impl = install.Cassandra{} 161 default: 162 return nil, fmt.Errorf("unknown cluster type: %s", clusterType) 163 } 164 165 nodes, err := install.ListNodes(nodeNames, len(c.VMs)) 166 if err != nil { 167 return nil, err 168 } 169 for _, n := range nodes { 170 if n > len(c.VMs) { 171 return nil, fmt.Errorf("invalid node spec %s, cluster contains %d nodes", 172 nodeNames, len(c.VMs)) 173 } 174 } 175 c.Nodes = nodes 176 c.Secure = secure 177 c.Env = nodeEnv 178 c.Args = nodeArgs 179 if tag != "" { 180 c.Tag = "/" + tag 181 } 182 c.UseTreeDist = useTreeDist 183 c.Quiet = quiet || !terminal.IsTerminal(int(os.Stdout.Fd())) 184 c.MaxConcurrency = maxConcurrency 185 return c, nil 186 } 187 188 // verifyClusterName ensures that the given name conforms to 189 // our naming pattern of "<username>-<clustername>". The 190 // username must match one of the vm.Provider account names 191 // or the --username override. 192 func verifyClusterName(clusterName string) (string, error) { 193 if len(clusterName) == 0 { 194 return "", fmt.Errorf("cluster name cannot be blank") 195 } 196 if clusterName == config.Local { 197 return clusterName, nil 198 } 199 200 alphaNum, err := regexp.Compile(`^[a-zA-Z0-9\-]+$`) 201 if err != nil { 202 return "", err 203 } 204 if !alphaNum.MatchString(clusterName) { 205 return "", errors.Errorf("cluster name must match %s", alphaNum.String()) 206 } 207 208 // Use the vm.Provider account names, or --username. 209 var accounts []string 210 if len(username) > 0 { 211 accounts = []string{username} 212 } else { 213 seenAccounts := map[string]bool{} 214 active, err := vm.FindActiveAccounts() 215 if err != nil { 216 return "", err 217 } 218 for _, account := range active { 219 if !seenAccounts[account] { 220 seenAccounts[account] = true 221 accounts = append(accounts, account) 222 } 223 } 224 } 225 226 // If we see <account>-<something>, accept it. 227 for _, account := range accounts { 228 if strings.HasPrefix(clusterName, account+"-") && len(clusterName) > len(account)+1 { 229 return clusterName, nil 230 } 231 } 232 233 // Try to pick out a reasonable cluster name from the input. 234 i := strings.Index(clusterName, "-") 235 suffix := clusterName 236 if i != -1 { 237 // The user specified a username prefix, but it didn't match an active 238 // account name. For example, assuming the account is "peter", `roachprod 239 // create joe-perf` should be specified as `roachprod create joe-perf -u 240 // joe`. 241 suffix = clusterName[i+1:] 242 } else { 243 // The user didn't specify a username prefix. For example, assuming the 244 // account is "peter", `roachprod create perf` should be specified as 245 // `roachprod create peter-perf`. 246 _ = 0 247 } 248 249 // Suggest acceptable cluster names. 250 var suggestions []string 251 for _, account := range accounts { 252 suggestions = append(suggestions, fmt.Sprintf("%s-%s", account, suffix)) 253 } 254 return "", fmt.Errorf("malformed cluster name %s, did you mean one of %s", 255 clusterName, suggestions) 256 } 257 258 // Provide `cobra.Command` functions with a standard return code handler. 259 // Exit codes come from rperrors.Error.ExitCode(). 260 // 261 // If the wrapped error tree of an error does not contain an instance of 262 // rperrors.Error, the error will automatically be wrapped with 263 // rperrors.Unclassified. 264 func wrap(f func(cmd *cobra.Command, args []string) error) func(cmd *cobra.Command, args []string) { 265 return func(cmd *cobra.Command, args []string) { 266 err := f(cmd, args) 267 if err != nil { 268 roachprodError, ok := rperrors.AsError(err) 269 if !ok { 270 roachprodError = rperrors.Unclassified{Err: err} 271 err = roachprodError 272 } 273 274 cmd.Printf("Error: %+v\n", err) 275 276 os.Exit(roachprodError.ExitCode()) 277 } 278 } 279 } 280 281 var createVMOpts vm.CreateOpts 282 283 type clusterAlreadyExistsError struct { 284 name string 285 } 286 287 func (e *clusterAlreadyExistsError) Error() string { 288 return fmt.Sprintf("cluster %s already exists", e.name) 289 } 290 291 func newClusterAlreadyExistsError(name string) error { 292 return &clusterAlreadyExistsError{name: name} 293 } 294 295 var createCmd = &cobra.Command{ 296 Use: "create <cluster>", 297 Short: "create a cluster", 298 Long: `Create a local or cloud-based cluster. 299 300 A cluster is composed of a set of nodes, configured during cluster creation via 301 the --nodes flag. Creating a cluster does not start any processes on the nodes 302 other than the base system processes (e.g. sshd). See "roachprod start" for 303 starting cockroach nodes and "roachprod {run,ssh}" for running arbitrary 304 commands on the nodes of a cluster. 305 306 Cloud Clusters 307 308 Cloud-based clusters are ephemeral and come with a lifetime (specified by the 309 --lifetime flag) after which they will be automatically 310 destroyed. Cloud-based clusters require the associated command line tool for 311 the cloud to be installed and configured (e.g. "gcloud auth login"). 312 313 Clusters names are required to be prefixed by the authenticated user of the 314 cloud service. The suffix is an arbitrary string used to distinguish 315 clusters. For example, "marc-test" is a valid cluster name for the user 316 "marc". The authenticated user for the cloud service is automatically 317 detected and can be override by the ROACHPROD_USER environment variable or 318 the --username flag. 319 320 The machine type and the use of local SSD storage can be specified during 321 cluster creation via the --{cloud}-machine-type and --local-ssd flags. The 322 machine-type is cloud specified. For example, --gce-machine-type=n1-highcpu-8 323 requests the "n1-highcpu-8" machine type for a GCE-based cluster. No attempt 324 is made (or desired) to abstract machine types across cloud providers. See 325 the cloud provider's documentation for details on the machine types 326 available. 327 328 Local Clusters 329 330 A local cluster stores the per-node data in ${HOME}/local on the machine 331 roachprod is being run on. Local clusters requires local ssh access. Unlike 332 cloud clusters there can be only a single local cluster, the local cluster is 333 always named "local", and has no expiration (unlimited lifetime). 334 `, 335 Args: cobra.ExactArgs(1), 336 Run: wrap(func(cmd *cobra.Command, args []string) (retErr error) { 337 if numNodes <= 0 || numNodes >= 1000 { 338 // Upper limit is just for safety. 339 return fmt.Errorf("number of nodes must be in [1..999]") 340 } 341 342 clusterName, err := verifyClusterName(args[0]) 343 if err != nil { 344 return err 345 } 346 createVMOpts.ClusterName = clusterName 347 348 defer func() { 349 if retErr == nil || clusterName == config.Local { 350 return 351 } 352 if errors.HasType(retErr, (*clusterAlreadyExistsError)(nil)) { 353 return 354 } 355 fmt.Fprintf(os.Stderr, "Cleaning up partially-created cluster (prev err: %s)\n", retErr) 356 if err := cleanupFailedCreate(clusterName); err != nil { 357 fmt.Fprintf(os.Stderr, "Error while cleaning up partially-created cluster: %s\n", err) 358 } else { 359 fmt.Fprintf(os.Stderr, "Cleaning up OK\n") 360 } 361 }() 362 363 if clusterName != config.Local { 364 cloud, err := cld.ListCloud() 365 if err != nil { 366 return err 367 } 368 if _, ok := cloud.Clusters[clusterName]; ok { 369 return newClusterAlreadyExistsError(clusterName) 370 } 371 } else { 372 if _, ok := install.Clusters[clusterName]; ok { 373 return newClusterAlreadyExistsError(clusterName) 374 } 375 376 // If the local cluster is being created, force the local Provider to be used 377 createVMOpts.VMProviders = []string{local.ProviderName} 378 } 379 380 fmt.Printf("Creating cluster %s with %d nodes\n", clusterName, numNodes) 381 if createErr := cld.CreateCluster(numNodes, createVMOpts); createErr != nil { 382 return createErr 383 } 384 385 // Just create directories for the local cluster as there's no need for ssh. 386 if clusterName == config.Local { 387 for i := 0; i < numNodes; i++ { 388 err := os.MkdirAll(fmt.Sprintf(os.ExpandEnv("${HOME}/local/%d"), i+1), 0755) 389 if err != nil { 390 return err 391 } 392 } 393 return nil 394 } 395 return setupSSH(clusterName) 396 }), 397 } 398 399 var setupSSHCmd = &cobra.Command{ 400 Use: "setup-ssh <cluster>", 401 Short: "set up ssh for a cluster", 402 Long: `Sets up the keys and host keys for the vms in the cluster. 403 404 It first resets the machine credentials as though the cluster were newly created 405 using the cloud provider APIs and then proceeds to ensure that the hosts can 406 SSH into eachother and lastly adds additional public keys to AWS hosts as read 407 from the GCP project. This operation is performed as the last step of creating 408 a new cluster but can be useful to re-run if the operation failed previously or 409 if the user would like to update the keys on the remote hosts. 410 `, 411 412 Args: cobra.ExactArgs(1), 413 Run: wrap(func(cmd *cobra.Command, args []string) (retErr error) { 414 clusterName, err := verifyClusterName(args[0]) 415 if err != nil { 416 return err 417 } 418 return setupSSH(clusterName) 419 }), 420 } 421 422 func setupSSH(clusterName string) error { 423 cloud, err := syncCloud(quiet) 424 if err != nil { 425 return err 426 } 427 cloudCluster, ok := cloud.Clusters[clusterName] 428 if !ok { 429 return fmt.Errorf("could not find %s in list of cluster", clusterName) 430 } 431 cloudCluster.PrintDetails() 432 // Run ssh-keygen -R serially on each new VM in case an IP address has been recycled 433 for _, v := range cloudCluster.VMs { 434 cmd := exec.Command("ssh-keygen", "-R", v.PublicIP) 435 out, err := cmd.CombinedOutput() 436 if err != nil { 437 log.Printf("could not clear ssh key for hostname %s:\n%s", v.PublicIP, string(out)) 438 } 439 440 } 441 442 // Wait for the nodes in the cluster to start. 443 install.Clusters = map[string]*install.SyncedCluster{} 444 if err := loadClusters(); err != nil { 445 return err 446 } 447 installCluster, err := newCluster(clusterName) 448 if err != nil { 449 return err 450 } 451 // For GCP clusters we need to use the config.OSUser even if the client 452 // requested the shared user. 453 for i := range installCluster.VMs { 454 if cloudCluster.VMs[i].Provider == gce.ProviderName { 455 installCluster.Users[i] = config.OSUser.Username 456 } 457 } 458 if err := installCluster.Wait(); err != nil { 459 return err 460 } 461 // Fetch public keys from gcloud to set up ssh access for all users into the 462 // shared ubuntu user. 463 installCluster.AuthorizedKeys, err = gce.GetUserAuthorizedKeys() 464 if err != nil { 465 return errors.Wrap(err, "failed to retrieve authorized keys from gcloud") 466 } 467 return installCluster.SetupSSH() 468 } 469 470 func cleanupFailedCreate(clusterName string) error { 471 cloud, err := cld.ListCloud() 472 if err != nil { 473 return err 474 } 475 c, ok := cloud.Clusters[clusterName] 476 if !ok { 477 // If the cluster doesn't exist, we didn't manage to create any VMs 478 // before failing. Not an error. 479 return nil 480 } 481 return cld.DestroyCluster(c) 482 } 483 484 var destroyCmd = &cobra.Command{ 485 Use: "destroy [ --all-mine | <cluster 1> [<cluster 2> ...] ]", 486 Short: "destroy clusters", 487 Long: `Destroy one or more local or cloud-based clusters. 488 489 The destroy command accepts the names of the clusters to destroy. Alternatively, 490 the --all-mine flag can be provided to destroy all clusters that are owned by the 491 current user. 492 493 Destroying a cluster releases the resources for a cluster. For a cloud-based 494 cluster the machine and associated disk resources are freed. For a local 495 cluster, any processes started by roachprod are stopped, and the ${HOME}/local 496 directory is removed. 497 `, 498 Args: cobra.ArbitraryArgs, 499 Run: wrap(func(cmd *cobra.Command, args []string) error { 500 switch len(args) { 501 case 0: 502 if !destroyAllMine { 503 return errors.New("no cluster name provided") 504 } 505 506 destroyPattern, err := userClusterNameRegexp() 507 if err != nil { 508 return err 509 } 510 511 cloud, err := cld.ListCloud() 512 if err != nil { 513 return err 514 } 515 516 var names []string 517 for name := range cloud.Clusters { 518 if destroyPattern.MatchString(name) { 519 names = append(names, name) 520 } 521 } 522 sort.Strings(names) 523 524 for _, clusterName := range names { 525 if err := destroyCluster(cloud, clusterName); err != nil { 526 return err 527 } 528 } 529 default: 530 if destroyAllMine { 531 return errors.New("--all-mine cannot be combined with cluster names") 532 } 533 534 var cloud *cld.Cloud 535 for _, arg := range args { 536 clusterName, err := verifyClusterName(arg) 537 if err != nil { 538 return err 539 } 540 541 if clusterName != config.Local { 542 if cloud == nil { 543 cloud, err = cld.ListCloud() 544 if err != nil { 545 return err 546 } 547 } 548 549 if err := destroyCluster(cloud, clusterName); err != nil { 550 return err 551 } 552 } else { 553 if err := destroyLocalCluster(); err != nil { 554 return err 555 } 556 } 557 } 558 } 559 fmt.Println("OK") 560 return nil 561 }), 562 } 563 564 func destroyCluster(cloud *cld.Cloud, clusterName string) error { 565 c, ok := cloud.Clusters[clusterName] 566 if !ok { 567 return fmt.Errorf("cluster %s does not exist", clusterName) 568 } 569 fmt.Printf("Destroying cluster %s with %d nodes\n", clusterName, len(c.VMs)) 570 return cld.DestroyCluster(c) 571 } 572 573 func destroyLocalCluster() error { 574 if _, ok := install.Clusters[config.Local]; !ok { 575 return fmt.Errorf("cluster %s does not exist", config.Local) 576 } 577 c, err := newCluster(config.Local) 578 if err != nil { 579 return err 580 } 581 c.Wipe(false) 582 for _, i := range c.Nodes { 583 err := os.RemoveAll(fmt.Sprintf(os.ExpandEnv("${HOME}/local/%d"), i)) 584 if err != nil { 585 return err 586 } 587 } 588 return os.Remove(filepath.Join(os.ExpandEnv(config.DefaultHostDir), c.Name)) 589 } 590 591 var cachedHostsCmd = &cobra.Command{ 592 Use: "cached-hosts", 593 Short: "list all clusters (and optionally their host numbers) from local cache", 594 Args: cobra.NoArgs, 595 Run: wrap(func(cmd *cobra.Command, args []string) error { 596 if err := loadClusters(); err != nil { 597 return err 598 } 599 600 names := make([]string, 0, len(install.Clusters)) 601 for name := range install.Clusters { 602 names = append(names, name) 603 } 604 sort.Strings(names) 605 606 for _, name := range names { 607 c := install.Clusters[name] 608 if strings.HasPrefix(c.Name, "teamcity") { 609 continue 610 } 611 fmt.Print(c.Name) 612 // when invokved by bash-completion, cachedHostsCluster is what the user 613 // has currently typed -- if this cluster matches that, expand its hosts. 614 if strings.HasPrefix(cachedHostsCluster, c.Name) { 615 for i := range c.VMs { 616 fmt.Printf(" %s:%d", c.Name, i+1) 617 } 618 } 619 fmt.Println() 620 } 621 return nil 622 }), 623 } 624 625 var listCmd = &cobra.Command{ 626 Use: "list [--details] [ --mine | <cluster name regex> ]", 627 Short: "list all clusters", 628 Long: `List all clusters. 629 630 The list command accepts an optional positional argument, which is a regular 631 expression that will be matched against the cluster name pattern. Alternatively, 632 the --mine flag can be provided to list the clusters that are owned by the current 633 user. 634 635 The default output shows one line per cluster, including the local cluster if 636 it exists: 637 638 ~ roachprod list 639 local: [local] 1 (-) 640 marc-test: [aws gce] 4 (5h34m35s) 641 Syncing... 642 643 The second column lists the cloud providers that host VMs for the cluster. 644 645 The third and fourth columns are the number of nodes in the cluster and the 646 time remaining before the cluster will be automatically destroyed. Note that 647 local clusters do not have an expiration. 648 649 The --details flag adjusts the output format to include per-node details: 650 651 ~ roachprod list --details 652 local [local]: (no expiration) 653 localhost 127.0.0.1 127.0.0.1 654 marc-test: [aws gce] 5h33m57s remaining 655 marc-test-0001 marc-test-0001.us-east1-b.cockroach-ephemeral 10.142.0.18 35.229.60.91 656 marc-test-0002 marc-test-0002.us-east1-b.cockroach-ephemeral 10.142.0.17 35.231.0.44 657 marc-test-0003 marc-test-0003.us-east1-b.cockroach-ephemeral 10.142.0.19 35.229.111.100 658 marc-test-0004 marc-test-0004.us-east1-b.cockroach-ephemeral 10.142.0.20 35.231.102.125 659 Syncing... 660 661 The first and second column are the node hostname and fully qualified name 662 respectively. The third and fourth column are the private and public IP 663 addresses. 664 665 The --json flag sets the format of the command output to json. 666 667 Listing clusters has the side-effect of syncing ssh keys/configs and the local 668 hosts file. 669 `, 670 Args: cobra.RangeArgs(0, 1), 671 Run: wrap(func(cmd *cobra.Command, args []string) error { 672 listPattern := regexp.MustCompile(".*") 673 switch len(args) { 674 case 0: 675 if listMine { 676 var err error 677 listPattern, err = userClusterNameRegexp() 678 if err != nil { 679 return err 680 } 681 } 682 case 1: 683 if listMine { 684 return errors.New("--mine cannot be combined with a pattern") 685 } 686 var err error 687 listPattern, err = regexp.Compile(args[0]) 688 if err != nil { 689 return errors.Wrapf(err, "could not compile regex pattern: %s", args[0]) 690 } 691 default: 692 return errors.New("only a single pattern may be listed") 693 } 694 695 cloud, err := syncCloud(quiet) 696 if err != nil { 697 return err 698 } 699 700 // Filter and sort by cluster names for stable output. 701 var names []string 702 filteredCloud := cloud.Clone() 703 for name := range cloud.Clusters { 704 if listPattern.MatchString(name) { 705 names = append(names, name) 706 } else { 707 delete(filteredCloud.Clusters, name) 708 } 709 } 710 sort.Strings(names) 711 712 if listJSON { 713 if listDetails { 714 return errors.New("--json cannot be combined with --detail") 715 } 716 717 enc := json.NewEncoder(os.Stdout) 718 enc.SetIndent("", " ") 719 if err := enc.Encode(filteredCloud); err != nil { 720 return err 721 } 722 } else { 723 // Align columns left and separate with at least two spaces. 724 tw := tabwriter.NewWriter(os.Stdout, 0, 8, 2, ' ', 0) 725 for _, name := range names { 726 c := filteredCloud.Clusters[name] 727 if listDetails { 728 c.PrintDetails() 729 } else { 730 fmt.Fprintf(tw, "%s:\t%s\t%d", c.Name, c.Clouds(), len(c.VMs)) 731 if !c.IsLocal() { 732 fmt.Fprintf(tw, "\t(%s)", c.LifetimeRemaining().Round(time.Second)) 733 } else { 734 fmt.Fprintf(tw, "\t(-)") 735 } 736 fmt.Fprintf(tw, "\n") 737 } 738 } 739 if err := tw.Flush(); err != nil { 740 return err 741 } 742 743 // Optionally print any dangling instances with errors 744 if listDetails { 745 collated := filteredCloud.BadInstanceErrors() 746 747 // Sort by Error() value for stable output 748 var errors ui.ErrorsByError 749 for err := range collated { 750 errors = append(errors, err) 751 } 752 sort.Sort(errors) 753 754 for _, e := range errors { 755 fmt.Printf("%s: %s\n", e, collated[e].Names()) 756 } 757 } 758 } 759 return nil 760 }), 761 } 762 763 // userClusterNameRegexp returns a regexp that matches all clusters owned by the 764 // current user. 765 func userClusterNameRegexp() (*regexp.Regexp, error) { 766 // In general, we expect that users will have the same 767 // account name across the services they're using, 768 // but we still want to function even if this is not 769 // the case. 770 seenAccounts := map[string]bool{} 771 accounts, err := vm.FindActiveAccounts() 772 if err != nil { 773 return nil, err 774 } 775 pattern := "" 776 for _, account := range accounts { 777 if !seenAccounts[account] { 778 seenAccounts[account] = true 779 if len(pattern) > 0 { 780 pattern += "|" 781 } 782 pattern += fmt.Sprintf("(^%s-)", regexp.QuoteMeta(account)) 783 } 784 } 785 return regexp.Compile(pattern) 786 } 787 788 // TODO(peter): Do we need this command given that the "list" command syncs as 789 // a side-effect. If you don't care about the list output, just "roachprod list 790 // &>/dev/null". 791 var syncCmd = &cobra.Command{ 792 Use: "sync", 793 Short: "sync ssh keys/config and hosts files", 794 Long: ``, 795 Args: cobra.NoArgs, 796 Run: wrap(func(cmd *cobra.Command, args []string) error { 797 _, err := syncCloud(quiet) 798 return err 799 }), 800 } 801 802 var lockFile = os.ExpandEnv("$HOME/.roachprod/LOCK") 803 804 var bashCompletion = os.ExpandEnv("$HOME/.roachprod/bash-completion.sh") 805 806 // syncCloud grabs an exclusive lock on the roachprod state and then proceeds to 807 // read the current state from the cloud and write it out to disk. The locking 808 // protects both the reading and the writing in order to prevent the hazard 809 // caused by concurrent goroutines reading cloud state in a different order 810 // than writing it to disk. 811 func syncCloud(quiet bool) (*cld.Cloud, error) { 812 if !quiet { 813 fmt.Println("Syncing...") 814 } 815 // Acquire a filesystem lock so that two concurrent synchronizations of 816 // roachprod state don't clobber each other. 817 f, err := os.Create(lockFile) 818 if err != nil { 819 return nil, errors.Wrapf(err, "creating lock file %q", lockFile) 820 } 821 if err := unix.Flock(int(f.Fd()), unix.LOCK_EX); err != nil { 822 return nil, errors.Wrap(err, "acquiring lock on %q") 823 } 824 defer f.Close() 825 cloud, err := cld.ListCloud() 826 if err != nil { 827 return nil, err 828 } 829 if err := syncHosts(cloud); err != nil { 830 return nil, err 831 } 832 833 var vms vm.List 834 for _, c := range cloud.Clusters { 835 vms = append(vms, c.VMs...) 836 } 837 838 // Figure out if we're going to overwrite the DNS entries. We don't want to 839 // overwrite if we don't have all the VMs of interest, so we only do it if we 840 // have a list of all VMs from both AWS and GCE (so if both providers have 841 // been used to get the VMs and for GCP also if we listed the VMs in the 842 // default project). 843 refreshDNS := true 844 845 if p := vm.Providers[gce.ProviderName]; !p.Active() { 846 refreshDNS = false 847 } else { 848 var defaultProjectFound bool 849 for _, prj := range p.(*gce.Provider).GetProjects() { 850 if prj == gce.DefaultProject() { 851 defaultProjectFound = true 852 break 853 } 854 } 855 if !defaultProjectFound { 856 refreshDNS = false 857 } 858 } 859 if !vm.Providers[aws.ProviderName].Active() { 860 refreshDNS = false 861 } 862 // DNS entries are maintained in the GCE DNS registry for all vms, from all 863 // clouds. 864 if refreshDNS { 865 if !quiet { 866 fmt.Println("Refreshing DNS entries...") 867 } 868 if err := gce.SyncDNS(vms); err != nil { 869 fmt.Fprintf(os.Stderr, "failed to update %s DNS: %v", gce.Subdomain, err) 870 } 871 } else { 872 if !quiet { 873 fmt.Println("Not refreshing DNS entries. We did not have all the VMs.") 874 } 875 } 876 877 if err := vm.ProvidersSequential(vm.AllProviderNames(), func(p vm.Provider) error { 878 return p.CleanSSH() 879 }); err != nil { 880 return nil, err 881 } 882 883 _ = rootCmd.GenBashCompletionFile(bashCompletion) 884 885 if err := vm.ProvidersSequential(vm.AllProviderNames(), func(p vm.Provider) error { 886 return p.ConfigSSH() 887 }); err != nil { 888 return nil, err 889 } 890 return cloud, nil 891 } 892 893 var gcCmd = &cobra.Command{ 894 Use: "gc", 895 Short: "GC expired clusters\n", 896 Long: `Garbage collect expired clusters. 897 898 Destroys expired clusters, sending email if properly configured. Usually run 899 hourly by a cronjob so it is not necessary to run manually. 900 `, 901 Args: cobra.NoArgs, 902 Run: wrap(func(cmd *cobra.Command, args []string) error { 903 cloud, err := cld.ListCloud() 904 if err != nil { 905 return err 906 } 907 return cld.GCClusters(cloud, dryrun) 908 }), 909 } 910 911 var extendCmd = &cobra.Command{ 912 Use: "extend <cluster>", 913 Short: "extend the lifetime of a cluster", 914 Long: `Extend the lifetime of the specified cluster to prevent it from being 915 destroyed: 916 917 roachprod extend marc-test --lifetime=6h 918 `, 919 Args: cobra.ExactArgs(1), 920 Run: wrap(func(cmd *cobra.Command, args []string) error { 921 clusterName, err := verifyClusterName(args[0]) 922 if err != nil { 923 return err 924 } 925 926 cloud, err := cld.ListCloud() 927 if err != nil { 928 return err 929 } 930 931 c, ok := cloud.Clusters[clusterName] 932 if !ok { 933 return fmt.Errorf("cluster %s does not exist", clusterName) 934 } 935 936 if err := cld.ExtendCluster(c, extendLifetime); err != nil { 937 return err 938 } 939 940 // Reload the clusters and print details. 941 cloud, err = cld.ListCloud() 942 if err != nil { 943 return err 944 } 945 946 c, ok = cloud.Clusters[clusterName] 947 if !ok { 948 return fmt.Errorf("cluster %s does not exist", clusterName) 949 } 950 951 c.PrintDetails() 952 return nil 953 }), 954 } 955 956 const tagHelp = ` 957 The --tag flag can be used to to associate a tag with the process. This tag can 958 then be used to restrict the processes which are operated on by the status and 959 stop commands. Tags can have a hierarchical component by utilizing a slash 960 separated string similar to a filesystem path. A tag matches if a prefix of the 961 components match. For example, the tag "a/b" will match both "a/b" and 962 "a/b/c/d". 963 ` 964 965 var startCmd = &cobra.Command{ 966 Use: "start <cluster>", 967 Short: "start nodes on a cluster", 968 Long: `Start nodes on a cluster. 969 970 The --secure flag can be used to start nodes in secure mode (i.e. using 971 certs). When specified, there is a one time initialization for the cluster to 972 create and distribute the certs. Note that running some modes in secure mode 973 and others in insecure mode is not a supported Cockroach configuration. 974 975 As a debugging aid, the --sequential flag starts the nodes sequentially so node 976 IDs match hostnames. Otherwise nodes are started are parallel. 977 978 The --binary flag specifies the remote binary to run. It is up to the roachprod 979 user to ensure this binary exists, usually via "roachprod put". Note that no 980 cockroach software is installed by default on a newly created cluster. 981 982 The --args and --env flags can be used to pass arbitrary command line flags and 983 environment variables to the cockroach process. 984 ` + tagHelp + ` 985 The "start" command takes care of setting up the --join address and specifying 986 reasonable defaults for other flags. One side-effect of this convenience is 987 that node 1 is special and must be started for the cluster to be initialized. 988 989 If the COCKROACH_DEV_LICENSE environment variable is set the enterprise.license 990 cluster setting will be set to its value. 991 `, 992 Args: cobra.ExactArgs(1), 993 Run: wrap(func(cmd *cobra.Command, args []string) error { 994 c, err := newCluster(args[0]) 995 if err != nil { 996 return err 997 } 998 c.Start() 999 return nil 1000 }), 1001 } 1002 1003 var stopCmd = &cobra.Command{ 1004 Use: "stop <cluster> [--sig] [--wait]", 1005 Short: "stop nodes on a cluster", 1006 Long: `Stop nodes on a cluster. 1007 1008 Stop roachprod created processes running on the nodes in a cluster, including 1009 processes started by the "start", "run" and "ssh" commands. Every process 1010 started by roachprod is tagged with a ROACHPROD=<node> environment variable 1011 which is used by "stop" to locate the processes and terminate them. By default 1012 processes are killed with signal 9 (SIGKILL) giving them no chance for a graceful 1013 exit. 1014 1015 The --sig flag will pass a signal to kill to allow us finer control over how we 1016 shutdown cockroach. The --wait flag causes stop to loop waiting for all 1017 processes with the ROACHPROD=<node> environment variable to exit. Note that 1018 stop will wait forever if you specify --wait with a non-terminating signal 1019 (e.g. SIGHUP). --wait defaults to true for signal 9 (SIGKILL) and false for all 1020 other signals. 1021 ` + tagHelp + ` 1022 `, 1023 Args: cobra.ExactArgs(1), 1024 Run: wrap(func(cmd *cobra.Command, args []string) error { 1025 c, err := newCluster(args[0]) 1026 if err != nil { 1027 return err 1028 } 1029 wait := waitFlag 1030 if sig == 9 /* SIGKILL */ && !cmd.Flags().Changed("wait") { 1031 wait = true 1032 } 1033 c.Stop(sig, wait) 1034 return nil 1035 }), 1036 } 1037 1038 var statusCmd = &cobra.Command{ 1039 Use: "status <cluster>", 1040 Short: "retrieve the status of nodes in a cluster", 1041 Long: `Retrieve the status of nodes in a cluster. 1042 1043 The "status" command outputs the binary and PID for the specified nodes: 1044 1045 ~ roachprod status local 1046 local: status 3/3 1047 1: cockroach 29688 1048 2: cockroach 29687 1049 3: cockroach 29689 1050 ` + tagHelp + ` 1051 `, 1052 Args: cobra.ExactArgs(1), 1053 Run: wrap(func(cmd *cobra.Command, args []string) error { 1054 c, err := newCluster(args[0]) 1055 if err != nil { 1056 return err 1057 } 1058 c.Status() 1059 return nil 1060 }), 1061 } 1062 1063 var logsCmd = &cobra.Command{ 1064 Use: "logs", 1065 Short: "retrieve and merge logs in a cluster", 1066 Long: `Retrieve and merge logs in a cluster. 1067 1068 The "logs" command runs until terminated. It works similarly to get but is 1069 specifically focused on retrieving logs periodically and then merging them 1070 into a single stream. 1071 `, 1072 Args: cobra.RangeArgs(1, 2), 1073 Run: wrap(func(cmd *cobra.Command, args []string) error { 1074 c, err := newCluster(args[0]) 1075 if err != nil { 1076 return err 1077 } 1078 var dest string 1079 if len(args) == 2 { 1080 dest = args[1] 1081 } else { 1082 dest = c.Name + ".logs" 1083 } 1084 return c.Logs(logsDir, dest, username, logsFilter, logsProgramFilter, logsInterval, logsFrom, logsTo, cmd.OutOrStdout()) 1085 }), 1086 } 1087 1088 var monitorCmd = &cobra.Command{ 1089 Use: "monitor", 1090 Short: "monitor the status of nodes in a cluster", 1091 Long: `Monitor the status of cockroach nodes in a cluster. 1092 1093 The "monitor" command runs until terminated. At startup it outputs a line for 1094 each specified node indicating the status of the node (either the PID of the 1095 node if alive, or "dead" otherwise). It then watches for changes in the status 1096 of nodes, outputting a line whenever a change is detected: 1097 1098 ~ roachprod monitor local 1099 1: 29688 1100 3: 29689 1101 2: 29687 1102 3: dead 1103 3: 30718 1104 `, 1105 Args: cobra.ExactArgs(1), 1106 Run: wrap(func(cmd *cobra.Command, args []string) error { 1107 c, err := newCluster(args[0]) 1108 if err != nil { 1109 return err 1110 } 1111 for msg := range c.Monitor(monitorIgnoreEmptyNodes, monitorOneShot) { 1112 if msg.Err != nil { 1113 msg.Msg += "error: " + msg.Err.Error() 1114 } 1115 thisError := errors.Newf("%d: %s", msg.Index, msg.Msg) 1116 if msg.Err != nil || strings.Contains(msg.Msg, "dead") { 1117 err = errors.CombineErrors(err, thisError) 1118 } 1119 fmt.Println(thisError.Error()) 1120 } 1121 return err 1122 }), 1123 } 1124 1125 var wipeCmd = &cobra.Command{ 1126 Use: "wipe <cluster>", 1127 Short: "wipe a cluster", 1128 Long: `Wipe the nodes in a cluster. 1129 1130 The "wipe" command first stops any processes running on the nodes in a cluster 1131 (via the "stop" command) and then deletes the data directories used by the 1132 nodes. 1133 `, 1134 Args: cobra.ExactArgs(1), 1135 Run: wrap(func(cmd *cobra.Command, args []string) error { 1136 c, err := newCluster(args[0]) 1137 if err != nil { 1138 return err 1139 } 1140 c.Wipe(wipePreserveCerts) 1141 return nil 1142 }), 1143 } 1144 1145 var reformatCmd = &cobra.Command{ 1146 Use: "reformat <cluster> <filesystem>", 1147 Short: "reformat disks in a cluster\n", 1148 Long: ` 1149 Reformat disks in a cluster to use the specified filesystem. 1150 1151 WARNING: Reformatting will delete all existing data in the cluster. 1152 1153 Filesystem options: 1154 ext4 1155 zfs 1156 1157 When running with ZFS, you can create a snapshot of the filesystem's current 1158 state using the 'zfs snapshot' command: 1159 1160 $ roachprod run <cluster> 'sudo zfs snapshot data1@pristine' 1161 1162 You can then nearly instantaneously restore the filesystem to this state with 1163 the 'zfs rollback' command: 1164 1165 $ roachprod run <cluster> 'sudo zfs rollback data1@pristine' 1166 1167 `, 1168 1169 Args: cobra.ExactArgs(2), 1170 Run: wrap(func(cmd *cobra.Command, args []string) error { 1171 c, err := newCluster(args[0]) 1172 if err != nil { 1173 return err 1174 } 1175 1176 var fsCmd string 1177 switch fs := args[1]; fs { 1178 case "zfs": 1179 if err := install.Install(c, []string{"zfs"}); err != nil { 1180 return err 1181 } 1182 fsCmd = `sudo zpool create -f data1 -m /mnt/data1 /dev/sdb` 1183 case "ext4": 1184 fsCmd = `sudo mkfs.ext4 -F /dev/sdb && sudo mount -o discard,defaults /dev/sdb /mnt/data1` 1185 default: 1186 return fmt.Errorf("unknown filesystem %q", fs) 1187 } 1188 1189 err = c.Run(os.Stdout, os.Stderr, c.Nodes, install.OtherCmd, "reformatting", fmt.Sprintf(` 1190 set -euo pipefail 1191 if sudo zpool list -Ho name 2>/dev/null | grep ^data1$; then 1192 sudo zpool destroy -f data1 1193 fi 1194 if mountpoint -q /mnt/data1; then 1195 sudo umount -f /mnt/data1 1196 fi 1197 %s 1198 sudo chmod 777 /mnt/data1 1199 `, fsCmd)) 1200 if err != nil { 1201 fmt.Fprintf(os.Stderr, "%s\n", err) 1202 } 1203 return nil 1204 }), 1205 } 1206 1207 var runCmd = &cobra.Command{ 1208 Use: "run <cluster> <command> [args]", 1209 Aliases: []string{"ssh"}, 1210 Short: "run a command on the nodes in a cluster", 1211 Long: `Run a command on the nodes in a cluster. 1212 `, 1213 Args: cobra.MinimumNArgs(1), 1214 Run: wrap(func(_ *cobra.Command, args []string) error { 1215 c, err := newCluster(args[0]) 1216 if err != nil { 1217 return err 1218 } 1219 1220 // Use "ssh" if an interactive session was requested (i.e. there is no 1221 // remote command to run). 1222 if len(args) == 1 { 1223 return c.SSH(nil, args[1:]) 1224 } 1225 1226 cmd := strings.TrimSpace(strings.Join(args[1:], " ")) 1227 title := cmd 1228 if len(title) > 30 { 1229 title = title[:27] + "..." 1230 } 1231 return c.Run(os.Stdout, os.Stderr, c.Nodes, install.CockroachCmd, title, cmd) 1232 }), 1233 } 1234 1235 var installCmd = &cobra.Command{ 1236 Use: "install <cluster> <software>", 1237 Short: "install 3rd party software", 1238 Long: `Install third party software. Currently available installation options are: 1239 1240 ` + strings.Join(install.SortedCmds(), "\n ") + ` 1241 `, 1242 Args: cobra.MinimumNArgs(2), 1243 Run: wrap(func(cmd *cobra.Command, args []string) error { 1244 c, err := newCluster(args[0]) 1245 if err != nil { 1246 return err 1247 } 1248 return install.Install(c, args[1:]) 1249 }), 1250 } 1251 1252 var stageCmd = &cobra.Command{ 1253 Use: "stage <cluster> <application> [<sha/version>]", 1254 Short: "stage cockroach binaries", 1255 Long: `Stages release and edge binaries to the cluster. 1256 1257 Currently available application options are: 1258 cockroach - Cockroach Unofficial. Can provide an optional SHA, otherwise 1259 latest build version is used. 1260 workload - Cockroach workload application. 1261 release - Official CockroachDB Release. Must provide a specific release 1262 version. 1263 1264 Some examples of usage: 1265 -- stage edge build of cockroach build at a specific SHA: 1266 roachprod stage my-cluster cockroach e90e6903fee7dd0f88e20e345c2ddfe1af1e5a97 1267 1268 -- Stage the most recent edge build of the workload tool: 1269 roachprod stage my-cluster workload 1270 1271 -- Stage the official release binary of CockroachDB at version 2.0.5 1272 roachprod stage my-cluster release v2.0.5 1273 `, 1274 Args: cobra.RangeArgs(2, 3), 1275 Run: wrap(func(cmd *cobra.Command, args []string) error { 1276 c, err := newCluster(args[0]) 1277 if err != nil { 1278 return err 1279 } 1280 1281 os := "linux" 1282 if stageOS != "" { 1283 os = stageOS 1284 } else if c.IsLocal() { 1285 os = runtime.GOOS 1286 } 1287 var debugArch, releaseArch string 1288 switch os { 1289 case "linux": 1290 debugArch, releaseArch = "linux-gnu-amd64", "linux-amd64" 1291 case "darwin": 1292 debugArch, releaseArch = "darwin-amd64", "darwin-10.9-amd64" 1293 case "windows": 1294 debugArch, releaseArch = "windows-amd64", "windows-6.2-amd64" 1295 default: 1296 return errors.Errorf("cannot stage binary on %s", os) 1297 } 1298 1299 applicationName := args[1] 1300 versionArg := "" 1301 if len(args) == 3 { 1302 versionArg = args[2] 1303 } 1304 switch applicationName { 1305 case "cockroach": 1306 return install.StageRemoteBinary( 1307 c, applicationName, "cockroach/cockroach", versionArg, debugArch, 1308 ) 1309 case "workload": 1310 return install.StageRemoteBinary( 1311 c, applicationName, "cockroach/workload", versionArg, "", /* arch */ 1312 ) 1313 case "release": 1314 return install.StageCockroachRelease(c, versionArg, releaseArch) 1315 default: 1316 return fmt.Errorf("unknown application %s", applicationName) 1317 } 1318 }), 1319 } 1320 1321 var distributeCertsCmd = &cobra.Command{ 1322 Use: "distribute-certs <cluster>", 1323 Short: "distribute certificates to the nodes in a cluster", 1324 Long: `Distribute certificates to the nodes in a cluster. 1325 If the certificates already exist, no action is taken. Note that this command is 1326 invoked automatically when a secure cluster is bootstrapped by "roachprod 1327 start." 1328 `, 1329 Args: cobra.ExactArgs(1), 1330 Run: wrap(func(cmd *cobra.Command, args []string) error { 1331 c, err := newCluster(args[0]) 1332 if err != nil { 1333 return err 1334 } 1335 c.DistributeCerts() 1336 return nil 1337 }), 1338 } 1339 1340 var putCmd = &cobra.Command{ 1341 Use: "put <cluster> <src> [<dest>]", 1342 Short: "copy a local file to the nodes in a cluster", 1343 Long: `Copy a local file to the nodes in a cluster. 1344 `, 1345 Args: cobra.RangeArgs(2, 3), 1346 Run: wrap(func(cmd *cobra.Command, args []string) error { 1347 src := args[1] 1348 dest := path.Base(src) 1349 if len(args) == 3 { 1350 dest = args[2] 1351 } 1352 c, err := newCluster(args[0]) 1353 if err != nil { 1354 return err 1355 } 1356 c.Put(src, dest) 1357 return nil 1358 }), 1359 } 1360 1361 var getCmd = &cobra.Command{ 1362 Use: "get <cluster> <src> [<dest>]", 1363 Short: "copy a remote file from the nodes in a cluster", 1364 Long: `Copy a remote file from the nodes in a cluster. If the file is retrieved from 1365 multiple nodes the destination file name will be prefixed with the node number. 1366 `, 1367 Args: cobra.RangeArgs(2, 3), 1368 Run: wrap(func(cmd *cobra.Command, args []string) error { 1369 src := args[1] 1370 dest := path.Base(src) 1371 if len(args) == 3 { 1372 dest = args[2] 1373 } 1374 c, err := newCluster(args[0]) 1375 if err != nil { 1376 return err 1377 } 1378 c.Get(src, dest) 1379 return nil 1380 }), 1381 } 1382 1383 var sqlCmd = &cobra.Command{ 1384 Use: "sql <cluster> -- [args]", 1385 Short: "run `cockroach sql` on a remote cluster", 1386 Long: "Run `cockroach sql` on a remote cluster.\n", 1387 Args: cobra.MinimumNArgs(1), 1388 Run: wrap(func(cmd *cobra.Command, args []string) error { 1389 c, err := newCluster(args[0]) 1390 if err != nil { 1391 return err 1392 } 1393 cockroach, ok := c.Impl.(install.Cockroach) 1394 if !ok { 1395 return errors.New("sql is only valid on cockroach clusters") 1396 } 1397 return cockroach.SQL(c, args[1:]) 1398 }), 1399 } 1400 1401 var pgurlCmd = &cobra.Command{ 1402 Use: "pgurl <cluster>", 1403 Short: "generate pgurls for the nodes in a cluster", 1404 Long: `Generate pgurls for the nodes in a cluster. 1405 `, 1406 Args: cobra.ExactArgs(1), 1407 Run: wrap(func(cmd *cobra.Command, args []string) error { 1408 c, err := newCluster(args[0]) 1409 if err != nil { 1410 return err 1411 } 1412 nodes := c.ServerNodes() 1413 ips := make([]string, len(nodes)) 1414 1415 if external { 1416 for i := 0; i < len(nodes); i++ { 1417 ips[i] = c.VMs[nodes[i]-1] 1418 } 1419 } else { 1420 c.Parallel("", len(nodes), 0, func(i int) ([]byte, error) { 1421 var err error 1422 ips[i], err = c.GetInternalIP(nodes[i]) 1423 return nil, err 1424 }) 1425 } 1426 1427 var urls []string 1428 for i, ip := range ips { 1429 if ip == "" { 1430 return errors.Errorf("empty ip: %v", ips) 1431 } 1432 urls = append(urls, c.Impl.NodeURL(c, ip, c.Impl.NodePort(c, nodes[i]))) 1433 } 1434 fmt.Println(strings.Join(urls, " ")) 1435 if len(urls) != len(nodes) { 1436 return errors.Errorf("have nodes %v, but urls %v from ips %v", nodes, urls, ips) 1437 } 1438 return nil 1439 }), 1440 } 1441 1442 var adminurlCmd = &cobra.Command{ 1443 Use: "adminurl <cluster>", 1444 Aliases: []string{"admin", "adminui"}, 1445 Short: "generate admin UI URLs for the nodes in a cluster\n", 1446 Long: `Generate admin UI URLs for the nodes in a cluster. 1447 `, 1448 Args: cobra.ExactArgs(1), 1449 Run: wrap(func(cmd *cobra.Command, args []string) error { 1450 c, err := newCluster(args[0]) 1451 if err != nil { 1452 return err 1453 } 1454 1455 for i, node := range c.ServerNodes() { 1456 host := vm.Name(c.Name, node) + "." + gce.Subdomain 1457 1458 // verify DNS is working / fallback to IPs if not. 1459 if i == 0 && !adminurlIPs { 1460 if _, err := net.LookupHost(host); err != nil { 1461 fmt.Fprintf(os.Stderr, "no valid DNS (yet?). might need to re-run `sync`?\n") 1462 adminurlIPs = true 1463 } 1464 } 1465 1466 if adminurlIPs { 1467 host = c.VMs[node-1] 1468 } 1469 port := install.GetAdminUIPort(c.Impl.NodePort(c, node)) 1470 scheme := "http" 1471 if c.Secure { 1472 scheme = "https" 1473 } 1474 if !strings.HasPrefix(adminurlPath, "/") { 1475 adminurlPath = "/" + adminurlPath 1476 } 1477 url := fmt.Sprintf("%s://%s:%d%s", scheme, host, port, adminurlPath) 1478 if adminurlOpen { 1479 if err := exec.Command("python", "-m", "webbrowser", url).Run(); err != nil { 1480 return err 1481 } 1482 } else { 1483 fmt.Println(url) 1484 } 1485 } 1486 return nil 1487 }), 1488 } 1489 1490 var ipCmd = &cobra.Command{ 1491 Use: "ip <cluster>", 1492 Short: "get the IP addresses of the nodes in a cluster", 1493 Long: `Get the IP addresses of the nodes in a cluster. 1494 `, 1495 Args: cobra.ExactArgs(1), 1496 Run: wrap(func(cmd *cobra.Command, args []string) error { 1497 c, err := newCluster(args[0]) 1498 if err != nil { 1499 return err 1500 } 1501 1502 nodes := c.ServerNodes() 1503 ips := make([]string, len(nodes)) 1504 1505 if external { 1506 for i := 0; i < len(nodes); i++ { 1507 ips[i] = c.VMs[nodes[i]-1] 1508 } 1509 } else { 1510 c.Parallel("", len(nodes), 0, func(i int) ([]byte, error) { 1511 var err error 1512 ips[i], err = c.GetInternalIP(nodes[i]) 1513 return nil, err 1514 }) 1515 } 1516 1517 for _, ip := range ips { 1518 fmt.Println(ip) 1519 } 1520 return nil 1521 }), 1522 } 1523 1524 func main() { 1525 // The commands are displayed in the order they are added to rootCmd. Note 1526 // that gcCmd and adminurlCmd contain a trailing \n in their Short help in 1527 // order to separate the commands into logical groups. 1528 cobra.EnableCommandSorting = false 1529 rootCmd.AddCommand( 1530 createCmd, 1531 destroyCmd, 1532 extendCmd, 1533 listCmd, 1534 syncCmd, 1535 gcCmd, 1536 setupSSHCmd, 1537 1538 statusCmd, 1539 monitorCmd, 1540 startCmd, 1541 stopCmd, 1542 runCmd, 1543 wipeCmd, 1544 reformatCmd, 1545 installCmd, 1546 distributeCertsCmd, 1547 putCmd, 1548 getCmd, 1549 stageCmd, 1550 sqlCmd, 1551 ipCmd, 1552 pgurlCmd, 1553 adminurlCmd, 1554 logsCmd, 1555 1556 cachedHostsCmd, 1557 ) 1558 rootCmd.BashCompletionFunction = fmt.Sprintf(`__custom_func() 1559 { 1560 # only complete the 2nd arg, e.g. adminurl <foo> 1561 if ! [ $c -eq 2 ]; then 1562 return 1563 fi 1564 1565 # don't complete commands which do not accept a cluster/host arg 1566 case ${last_command} in 1567 %s) 1568 return 1569 ;; 1570 esac 1571 1572 local hosts_out 1573 if hosts_out=$(roachprod cached-hosts --cluster="${cur}" 2>/dev/null); then 1574 COMPREPLY=( $( compgen -W "${hosts_out[*]}" -- "$cur" ) ) 1575 fi 1576 1577 }`, 1578 strings.Join(func(cmds ...*cobra.Command) (s []string) { 1579 for _, cmd := range cmds { 1580 s = append(s, fmt.Sprintf("%s_%s", rootCmd.Name(), cmd.Name())) 1581 } 1582 return s 1583 }(createCmd, listCmd, syncCmd, gcCmd), " | "), 1584 ) 1585 1586 rootCmd.PersistentFlags().BoolVarP( 1587 &quiet, "quiet", "q", false, "disable fancy progress output") 1588 rootCmd.PersistentFlags().IntVarP( 1589 &maxConcurrency, "max-concurrency", "", 32, 1590 "maximum number of operations to execute on nodes concurrently, set to zero for infinite") 1591 for _, cmd := range []*cobra.Command{createCmd, destroyCmd, extendCmd, logsCmd} { 1592 cmd.Flags().StringVarP(&username, "username", "u", os.Getenv("ROACHPROD_USER"), 1593 "Username to run under, detect if blank") 1594 } 1595 1596 for _, cmd := range []*cobra.Command{statusCmd, monitorCmd, startCmd, 1597 stopCmd, runCmd, wipeCmd, reformatCmd, installCmd, putCmd, getCmd, 1598 sqlCmd, pgurlCmd, adminurlCmd, ipCmd, 1599 } { 1600 cmd.Flags().BoolVar( 1601 &ssh.InsecureIgnoreHostKey, "insecure-ignore-host-key", true, "don't check ssh host keys") 1602 } 1603 1604 createCmd.Flags().DurationVarP(&createVMOpts.Lifetime, 1605 "lifetime", "l", 12*time.Hour, "Lifetime of the cluster") 1606 createCmd.Flags().BoolVar(&createVMOpts.SSDOpts.UseLocalSSD, 1607 "local-ssd", true, "Use local SSD") 1608 createCmd.Flags().BoolVar(&createVMOpts.SSDOpts.NoExt4Barrier, 1609 "local-ssd-no-ext4-barrier", true, 1610 `Mount the local SSD with the "-o nobarrier" flag. `+ 1611 `Ignored if --local-ssd=false is specified.`) 1612 createCmd.Flags().IntVarP(&numNodes, 1613 "nodes", "n", 4, "Total number of nodes, distributed across all clouds") 1614 createCmd.Flags().StringSliceVarP(&createVMOpts.VMProviders, 1615 "clouds", "c", []string{gce.ProviderName}, 1616 fmt.Sprintf("The cloud provider(s) to use when creating new vm instances: %s", vm.AllProviderNames())) 1617 createCmd.Flags().BoolVar(&createVMOpts.GeoDistributed, 1618 "geo", false, "Create geo-distributed cluster") 1619 // Allow each Provider to inject additional configuration flags 1620 for _, p := range vm.Providers { 1621 p.Flags().ConfigureCreateFlags(createCmd.Flags()) 1622 1623 for _, cmd := range []*cobra.Command{ 1624 destroyCmd, extendCmd, listCmd, syncCmd, gcCmd, 1625 } { 1626 p.Flags().ConfigureClusterFlags(cmd.Flags(), vm.AcceptMultipleProjects) 1627 } 1628 // createCmd only accepts a single GCE project, as opposed to all the other 1629 // commands. 1630 p.Flags().ConfigureClusterFlags(createCmd.Flags(), vm.SingleProject) 1631 } 1632 1633 destroyCmd.Flags().BoolVarP(&destroyAllMine, 1634 "all-mine", "m", false, "Destroy all clusters belonging to the current user") 1635 1636 extendCmd.Flags().DurationVarP(&extendLifetime, 1637 "lifetime", "l", 12*time.Hour, "Lifetime of the cluster") 1638 1639 listCmd.Flags().BoolVarP(&listDetails, 1640 "details", "d", false, "Show cluster details") 1641 listCmd.Flags().BoolVar(&listJSON, 1642 "json", false, "Show cluster specs in a json format") 1643 listCmd.Flags().BoolVarP(&listMine, 1644 "mine", "m", false, "Show only clusters belonging to the current user") 1645 1646 adminurlCmd.Flags().BoolVar( 1647 &adminurlOpen, `open`, false, `Open the url in a browser`) 1648 adminurlCmd.Flags().StringVar( 1649 &adminurlPath, `path`, "/", `Path to add to URL (e.g. to open a same page on each node)`) 1650 adminurlCmd.Flags().BoolVar( 1651 &adminurlIPs, `ips`, false, `Use Public IPs instead of DNS names in URL`) 1652 1653 gcCmd.Flags().BoolVarP( 1654 &dryrun, "dry-run", "n", dryrun, "dry run (don't perform any actions)") 1655 gcCmd.Flags().StringVar(&config.SlackToken, "slack-token", "", "Slack bot token") 1656 1657 pgurlCmd.Flags().BoolVar( 1658 &external, "external", false, "return pgurls for external connections") 1659 1660 ipCmd.Flags().BoolVar( 1661 &external, "external", false, "return external IP addresses") 1662 1663 runCmd.Flags().BoolVar( 1664 &secure, "secure", false, "use a secure cluster") 1665 1666 startCmd.Flags().IntVarP(&numRacks, 1667 "racks", "r", 0, "the number of racks to partition the nodes into") 1668 1669 stopCmd.Flags().IntVar(&sig, "sig", sig, "signal to pass to kill") 1670 stopCmd.Flags().BoolVar(&waitFlag, "wait", waitFlag, "wait for processes to exit") 1671 1672 wipeCmd.Flags().BoolVar(&wipePreserveCerts, "preserve-certs", false, "do not wipe certificates") 1673 1674 for _, cmd := range []*cobra.Command{ 1675 startCmd, statusCmd, stopCmd, runCmd, 1676 } { 1677 cmd.Flags().StringVar( 1678 &tag, "tag", "", "the process tag") 1679 } 1680 1681 for _, cmd := range []*cobra.Command{ 1682 startCmd, putCmd, getCmd, 1683 } { 1684 cmd.Flags().BoolVar(new(bool), "scp", false, "DEPRECATED") 1685 _ = cmd.Flags().MarkDeprecated("scp", "always true") 1686 } 1687 1688 putCmd.Flags().BoolVar(&useTreeDist, "treedist", useTreeDist, "use treedist copy algorithm") 1689 1690 stageCmd.Flags().StringVar(&stageOS, "os", "", "operating system override for staged binaries") 1691 1692 logsCmd.Flags().StringVar( 1693 &logsFilter, "filter", "", "re to filter log messages") 1694 logsCmd.Flags().Var( 1695 flagutil.Time(&logsFrom), "from", "time from which to stream logs") 1696 logsCmd.Flags().Var( 1697 flagutil.Time(&logsTo), "to", "time to which to stream logs") 1698 logsCmd.Flags().DurationVar( 1699 &logsInterval, "interval", 200*time.Millisecond, "interval to poll logs from host") 1700 logsCmd.Flags().StringVar( 1701 &logsDir, "logs-dir", "logs", "path to the logs dir, if remote, relative to username's home dir, ignored if local") 1702 logsCmd.Flags().StringVar( 1703 &logsProgramFilter, "logs-program", "^cockroach$", "regular expression of the name of program in log files to search") 1704 1705 monitorCmd.Flags().BoolVar( 1706 &monitorIgnoreEmptyNodes, 1707 "ignore-empty-nodes", 1708 false, 1709 "Automatically detect the (subset of the given) nodes which to monitor "+ 1710 "based on the presence of a nontrivial data directory.") 1711 1712 monitorCmd.Flags().BoolVar( 1713 &monitorOneShot, 1714 "oneshot", 1715 false, 1716 "Report the status of all targeted nodes once, then exit. The exit "+ 1717 "status is nonzero if (and only if) any node was found not running.") 1718 1719 cachedHostsCmd.Flags().StringVar(&cachedHostsCluster, "cluster", "", "print hosts matching cluster") 1720 1721 for _, cmd := range []*cobra.Command{ 1722 getCmd, putCmd, runCmd, startCmd, statusCmd, stopCmd, 1723 wipeCmd, pgurlCmd, adminurlCmd, sqlCmd, installCmd, 1724 } { 1725 switch cmd { 1726 case startCmd: 1727 cmd.Flags().BoolVar( 1728 &install.StartOpts.Sequential, "sequential", true, 1729 "start nodes sequentially so node IDs match hostnames") 1730 cmd.Flags().StringArrayVarP( 1731 &nodeArgs, "args", "a", nil, "node arguments") 1732 cmd.Flags().StringVarP( 1733 &nodeEnv, "env", "e", nodeEnv, "node environment variables") 1734 cmd.Flags().StringVarP( 1735 &clusterType, "type", "t", clusterType, `cluster type ("cockroach" or "cassandra")`) 1736 cmd.Flags().BoolVar( 1737 &install.StartOpts.Encrypt, "encrypt", encrypt, "start nodes with encryption at rest turned on") 1738 fallthrough 1739 case sqlCmd: 1740 cmd.Flags().StringVarP( 1741 &config.Binary, "binary", "b", config.Binary, 1742 "the remote cockroach binary to use") 1743 fallthrough 1744 case pgurlCmd, adminurlCmd: 1745 cmd.Flags().BoolVar( 1746 &secure, "secure", false, "use a secure cluster") 1747 } 1748 1749 if cmd.Long == "" { 1750 cmd.Long = cmd.Short 1751 } 1752 cmd.Long += fmt.Sprintf(` 1753 Node specification 1754 1755 By default the operation is performed on all nodes in <cluster>. A subset of 1756 nodes can be specified by appending :<nodes> to the cluster name. The syntax 1757 of <nodes> is a comma separated list of specific node IDs or range of 1758 IDs. For example: 1759 1760 roachprod %[1]s marc-test:1-3,8-9 1761 1762 will perform %[1]s on: 1763 1764 marc-test-1 1765 marc-test-2 1766 marc-test-3 1767 marc-test-8 1768 marc-test-9 1769 `, cmd.Name()) 1770 } 1771 1772 var err error 1773 config.OSUser, err = user.Current() 1774 if err != nil { 1775 fmt.Fprintf(os.Stderr, "unable to lookup current user: %s\n", err) 1776 os.Exit(1) 1777 } 1778 1779 if err := initDirs(); err != nil { 1780 fmt.Fprintf(os.Stderr, "%s\n", err) 1781 os.Exit(1) 1782 } 1783 1784 if err := loadClusters(); err != nil { 1785 // We don't want to exit as we may be looking at the help message. 1786 fmt.Printf("problem loading clusters: %s\n", err) 1787 } 1788 1789 if err := rootCmd.Execute(); err != nil { 1790 // Cobra has already printed the error message. 1791 os.Exit(1) 1792 } 1793 }