github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachprod/main.go (about)

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package main
    12  
    13  import (
    14  	"encoding/json"
    15  	"fmt"
    16  	"log"
    17  	"net"
    18  	"os"
    19  	"os/exec"
    20  	"os/user"
    21  	"path"
    22  	"path/filepath"
    23  	"regexp"
    24  	"runtime"
    25  	"sort"
    26  	"strings"
    27  	"text/tabwriter"
    28  	"time"
    29  
    30  	cld "github.com/cockroachdb/cockroach/pkg/cmd/roachprod/cloud"
    31  	"github.com/cockroachdb/cockroach/pkg/cmd/roachprod/config"
    32  	rperrors "github.com/cockroachdb/cockroach/pkg/cmd/roachprod/errors"
    33  	"github.com/cockroachdb/cockroach/pkg/cmd/roachprod/install"
    34  	"github.com/cockroachdb/cockroach/pkg/cmd/roachprod/ssh"
    35  	"github.com/cockroachdb/cockroach/pkg/cmd/roachprod/ui"
    36  	"github.com/cockroachdb/cockroach/pkg/cmd/roachprod/vm"
    37  	"github.com/cockroachdb/cockroach/pkg/cmd/roachprod/vm/aws"
    38  	_ "github.com/cockroachdb/cockroach/pkg/cmd/roachprod/vm/azure"
    39  	"github.com/cockroachdb/cockroach/pkg/cmd/roachprod/vm/gce"
    40  	"github.com/cockroachdb/cockroach/pkg/cmd/roachprod/vm/local"
    41  	"github.com/cockroachdb/cockroach/pkg/util/flagutil"
    42  	"github.com/cockroachdb/errors"
    43  	"github.com/spf13/cobra"
    44  	"golang.org/x/crypto/ssh/terminal"
    45  	"golang.org/x/sys/unix"
    46  )
    47  
    48  var rootCmd = &cobra.Command{
    49  	Use:   "roachprod [command] (flags)",
    50  	Short: "roachprod tool for manipulating test clusters",
    51  	Long: `roachprod is a tool for manipulating ephemeral test clusters, allowing easy
    52  creation, destruction, starting, stopping and wiping of clusters along with
    53  running load generators.
    54  
    55  Examples:
    56  
    57    roachprod create local -n 3
    58    roachprod start local
    59    roachprod sql local:2 -- -e "select * from crdb_internal.node_runtime_info"
    60    roachprod stop local
    61    roachprod wipe local
    62    roachprod destroy local
    63  
    64  The above commands will create a "local" 3-node cluster, start a cockroach
    65  cluster on these nodes, run a sql command on the 2nd node, stop, wipe and
    66  destroy the cluster.
    67  `,
    68  }
    69  
    70  var (
    71  	numNodes          int
    72  	numRacks          int
    73  	username          string
    74  	dryrun            bool
    75  	destroyAllMine    bool
    76  	extendLifetime    time.Duration
    77  	wipePreserveCerts bool
    78  	listDetails       bool
    79  	listJSON          bool
    80  	listMine          bool
    81  	clusterType       = "cockroach"
    82  	secure            = false
    83  	nodeEnv           = "COCKROACH_ENABLE_RPC_COMPRESSION=false"
    84  	nodeArgs          []string
    85  	tag               string
    86  	external          = false
    87  	adminurlOpen      = false
    88  	adminurlPath      = ""
    89  	adminurlIPs       = false
    90  	useTreeDist       = true
    91  	encrypt           = false
    92  	quiet             = false
    93  	sig               = 9
    94  	waitFlag          = false
    95  	stageOS           string
    96  	logsDir           string
    97  	logsFilter        string
    98  	logsProgramFilter string
    99  	logsFrom          time.Time
   100  	logsTo            time.Time
   101  	logsInterval      time.Duration
   102  	maxConcurrency    int
   103  
   104  	monitorIgnoreEmptyNodes bool
   105  	monitorOneShot          bool
   106  
   107  	cachedHostsCluster string
   108  )
   109  
   110  func sortedClusters() []string {
   111  	var r []string
   112  	for n := range install.Clusters {
   113  		r = append(r, n)
   114  	}
   115  	sort.Strings(r)
   116  	return r
   117  }
   118  
   119  func newCluster(name string) (*install.SyncedCluster, error) {
   120  	nodeNames := "all"
   121  	{
   122  		parts := strings.Split(name, ":")
   123  		switch len(parts) {
   124  		case 2:
   125  			nodeNames = parts[1]
   126  			fallthrough
   127  		case 1:
   128  			name = parts[0]
   129  		case 0:
   130  			return nil, fmt.Errorf("no cluster specified")
   131  		default:
   132  			return nil, fmt.Errorf("invalid cluster name: %s", name)
   133  		}
   134  	}
   135  
   136  	c, ok := install.Clusters[name]
   137  	if !ok {
   138  		err := errors.Newf(`unknown cluster: %s`, name)
   139  		err = errors.WithHintf(err, `
   140  Available clusters:
   141    %s
   142  `, strings.Join(sortedClusters(), "\n  "))
   143  		err = errors.WithHint(err, `Use "roachprod sync" to update the list of available clusters.`)
   144  		return nil, err
   145  	}
   146  
   147  	switch clusterType {
   148  	case "cockroach":
   149  		c.Impl = install.Cockroach{}
   150  		if numRacks > 0 {
   151  			for i := range c.Localities {
   152  				rack := fmt.Sprintf("rack=%d", i%numRacks)
   153  				if c.Localities[i] != "" {
   154  					rack = "," + rack
   155  				}
   156  				c.Localities[i] += rack
   157  			}
   158  		}
   159  	case "cassandra":
   160  		c.Impl = install.Cassandra{}
   161  	default:
   162  		return nil, fmt.Errorf("unknown cluster type: %s", clusterType)
   163  	}
   164  
   165  	nodes, err := install.ListNodes(nodeNames, len(c.VMs))
   166  	if err != nil {
   167  		return nil, err
   168  	}
   169  	for _, n := range nodes {
   170  		if n > len(c.VMs) {
   171  			return nil, fmt.Errorf("invalid node spec %s, cluster contains %d nodes",
   172  				nodeNames, len(c.VMs))
   173  		}
   174  	}
   175  	c.Nodes = nodes
   176  	c.Secure = secure
   177  	c.Env = nodeEnv
   178  	c.Args = nodeArgs
   179  	if tag != "" {
   180  		c.Tag = "/" + tag
   181  	}
   182  	c.UseTreeDist = useTreeDist
   183  	c.Quiet = quiet || !terminal.IsTerminal(int(os.Stdout.Fd()))
   184  	c.MaxConcurrency = maxConcurrency
   185  	return c, nil
   186  }
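
// exampleNodeSpecs is an illustrative sketch, not part of the original file:
// the cluster name is hypothetical, and it simply exercises the
// "<cluster>[:<nodes>]" argument forms that newCluster parses above. Node
// lists and ranges in the suffix are handled by install.ListNodes.
func exampleNodeSpecs() {
	for _, spec := range []string{
		"marc-test",     // all nodes in the cluster
		"marc-test:2",   // only node 2
		"marc-test:1-3", // nodes 1 through 3 (range syntax accepted by install.ListNodes)
	} {
		if c, err := newCluster(spec); err == nil {
			fmt.Printf("%s -> nodes %v\n", spec, c.Nodes)
		}
	}
}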
   187  
   188  // verifyClusterName ensures that the given name conforms to
   189  // our naming pattern of "<username>-<clustername>". The
   190  // username must match one of the vm.Provider account names
   191  // or the --username override.
   192  func verifyClusterName(clusterName string) (string, error) {
   193  	if len(clusterName) == 0 {
   194  		return "", fmt.Errorf("cluster name cannot be blank")
   195  	}
   196  	if clusterName == config.Local {
   197  		return clusterName, nil
   198  	}
   199  
   200  	alphaNum, err := regexp.Compile(`^[a-zA-Z0-9\-]+$`)
   201  	if err != nil {
   202  		return "", err
   203  	}
   204  	if !alphaNum.MatchString(clusterName) {
   205  		return "", errors.Errorf("cluster name must match %s", alphaNum.String())
   206  	}
   207  
   208  	// Use the vm.Provider account names, or --username.
   209  	var accounts []string
   210  	if len(username) > 0 {
   211  		accounts = []string{username}
   212  	} else {
   213  		seenAccounts := map[string]bool{}
   214  		active, err := vm.FindActiveAccounts()
   215  		if err != nil {
   216  			return "", err
   217  		}
   218  		for _, account := range active {
   219  			if !seenAccounts[account] {
   220  				seenAccounts[account] = true
   221  				accounts = append(accounts, account)
   222  			}
   223  		}
   224  	}
   225  
   226  	// If we see <account>-<something>, accept it.
   227  	for _, account := range accounts {
   228  		if strings.HasPrefix(clusterName, account+"-") && len(clusterName) > len(account)+1 {
   229  			return clusterName, nil
   230  		}
   231  	}
   232  
   233  	// Try to pick out a reasonable cluster name from the input.
   234  	i := strings.Index(clusterName, "-")
   235  	suffix := clusterName
   236  	if i != -1 {
   237  		// The user specified a username prefix, but it didn't match an active
   238  		// account name. For example, assuming the account is "peter", `roachprod
   239  		// create joe-perf` should be specified as `roachprod create joe-perf -u
   240  		// joe`.
   241  		suffix = clusterName[i+1:]
   242  	} else {
   243  		// The user didn't specify a username prefix. For example, assuming the
   244  		// account is "peter", `roachprod create perf` should be specified as
   245  		// `roachprod create peter-perf`.
   246  		_ = 0
   247  	}
   248  
   249  	// Suggest acceptable cluster names.
   250  	var suggestions []string
   251  	for _, account := range accounts {
   252  		suggestions = append(suggestions, fmt.Sprintf("%s-%s", account, suffix))
   253  	}
   254  	return "", fmt.Errorf("malformed cluster name %s, did you mean one of %s",
   255  		clusterName, suggestions)
   256  }
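
// exampleVerifyClusterName sketches, with a hypothetical account "marc", how
// the naming rule above behaves. Assigning the package-level username variable
// stands in for passing --username on the command line.
func exampleVerifyClusterName() {
	username = "marc"
	for _, name := range []string{"marc-test", "perf", "joe-perf"} {
		if _, err := verifyClusterName(name); err != nil {
			// "perf" and "joe-perf" are rejected with the suggestion "marc-perf".
			fmt.Printf("%s: %v\n", name, err)
		} else {
			fmt.Printf("%s: ok\n", name)
		}
	}
}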
   257  
   258  // Provide `cobra.Command` functions with a standard return code handler.
   259  // Exit codes come from rperrors.Error.ExitCode().
   260  //
   261  // If the wrapped error tree of an error does not contain an instance of
   262  // rperrors.Error, the error will automatically be wrapped with
   263  // rperrors.Unclassified.
   264  func wrap(f func(cmd *cobra.Command, args []string) error) func(cmd *cobra.Command, args []string) {
   265  	return func(cmd *cobra.Command, args []string) {
   266  		err := f(cmd, args)
   267  		if err != nil {
   268  			roachprodError, ok := rperrors.AsError(err)
   269  			if !ok {
   270  				roachprodError = rperrors.Unclassified{Err: err}
   271  				err = roachprodError
   272  			}
   273  
   274  			cmd.Printf("Error: %+v\n", err)
   275  
   276  			os.Exit(roachprodError.ExitCode())
   277  		}
   278  	}
   279  }
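
// exampleCmd is a hypothetical, unregistered command illustrating how wrap is
// used throughout this file: the inner function returns an error, and wrap
// prints it and exits with the code from rperrors.Error.ExitCode().
var exampleCmd = &cobra.Command{
	Use:  "example",
	Args: cobra.NoArgs,
	Run: wrap(func(cmd *cobra.Command, args []string) error {
		// Errors without an rperrors.Error in their tree exit with the
		// rperrors.Unclassified exit code.
		return errors.New("always fails")
	}),
}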
   280  
   281  var createVMOpts vm.CreateOpts
   282  
   283  type clusterAlreadyExistsError struct {
   284  	name string
   285  }
   286  
   287  func (e *clusterAlreadyExistsError) Error() string {
   288  	return fmt.Sprintf("cluster %s already exists", e.name)
   289  }
   290  
   291  func newClusterAlreadyExistsError(name string) error {
   292  	return &clusterAlreadyExistsError{name: name}
   293  }
   294  
   295  var createCmd = &cobra.Command{
   296  	Use:   "create <cluster>",
   297  	Short: "create a cluster",
   298  	Long: `Create a local or cloud-based cluster.
   299  
   300  A cluster is composed of a set of nodes, configured during cluster creation via
   301  the --nodes flag. Creating a cluster does not start any processes on the nodes
   302  other than the base system processes (e.g. sshd). See "roachprod start" for
   303  starting cockroach nodes and "roachprod {run,ssh}" for running arbitrary
   304  commands on the nodes of a cluster.
   305  
   306  Cloud Clusters
   307  
   308    Cloud-based clusters are ephemeral and come with a lifetime (specified by the
   309    --lifetime flag) after which they will be automatically
   310    destroyed. Cloud-based clusters require the associated command line tool for
   311    the cloud to be installed and configured (e.g. "gcloud auth login").
   312  
   313    Cluster names are required to be prefixed by the authenticated user of the
   314    cloud service. The suffix is an arbitrary string used to distinguish
   315    clusters. For example, "marc-test" is a valid cluster name for the user
   316    "marc". The authenticated user for the cloud service is automatically
   317    detected and can be overridden by the ROACHPROD_USER environment variable or
   318    the --username flag.
   319  
   320    The machine type and the use of local SSD storage can be specified during
   321    cluster creation via the --{cloud}-machine-type and --local-ssd flags. The
   322    machine type is cloud-specific. For example, --gce-machine-type=n1-highcpu-8
   323    requests the "n1-highcpu-8" machine type for a GCE-based cluster. No attempt
   324    is made (or desired) to abstract machine types across cloud providers. See
   325    the cloud provider's documentation for details on the machine types
   326    available.
   327  
   328  Local Clusters
   329  
   330    A local cluster stores the per-node data in ${HOME}/local on the machine
   331    roachprod is being run on. Local clusters require local ssh access. Unlike
   332    cloud clusters, there can be only a single local cluster; it is always named
   333    "local" and has no expiration (unlimited lifetime).
   334  `,
   335  	Args: cobra.ExactArgs(1),
   336  	Run: wrap(func(cmd *cobra.Command, args []string) (retErr error) {
   337  		if numNodes <= 0 || numNodes >= 1000 {
   338  			// Upper limit is just for safety.
   339  			return fmt.Errorf("number of nodes must be in [1..999]")
   340  		}
   341  
   342  		clusterName, err := verifyClusterName(args[0])
   343  		if err != nil {
   344  			return err
   345  		}
   346  		createVMOpts.ClusterName = clusterName
   347  
   348  		defer func() {
   349  			if retErr == nil || clusterName == config.Local {
   350  				return
   351  			}
   352  			if errors.HasType(retErr, (*clusterAlreadyExistsError)(nil)) {
   353  				return
   354  			}
   355  			fmt.Fprintf(os.Stderr, "Cleaning up partially-created cluster (prev err: %s)\n", retErr)
   356  			if err := cleanupFailedCreate(clusterName); err != nil {
   357  				fmt.Fprintf(os.Stderr, "Error while cleaning up partially-created cluster: %s\n", err)
   358  			} else {
   359  				fmt.Fprintf(os.Stderr, "Cleaning up OK\n")
   360  			}
   361  		}()
   362  
   363  		if clusterName != config.Local {
   364  			cloud, err := cld.ListCloud()
   365  			if err != nil {
   366  				return err
   367  			}
   368  			if _, ok := cloud.Clusters[clusterName]; ok {
   369  				return newClusterAlreadyExistsError(clusterName)
   370  			}
   371  		} else {
   372  			if _, ok := install.Clusters[clusterName]; ok {
   373  				return newClusterAlreadyExistsError(clusterName)
   374  			}
   375  
   376  			// If the local cluster is being created, force the local Provider to be used
   377  			createVMOpts.VMProviders = []string{local.ProviderName}
   378  		}
   379  
   380  		fmt.Printf("Creating cluster %s with %d nodes\n", clusterName, numNodes)
   381  		if createErr := cld.CreateCluster(numNodes, createVMOpts); createErr != nil {
   382  			return createErr
   383  		}
   384  
   385  		// Just create directories for the local cluster as there's no need for ssh.
   386  		if clusterName == config.Local {
   387  			for i := 0; i < numNodes; i++ {
   388  				err := os.MkdirAll(fmt.Sprintf(os.ExpandEnv("${HOME}/local/%d"), i+1), 0755)
   389  				if err != nil {
   390  					return err
   391  				}
   392  			}
   393  			return nil
   394  		}
   395  		return setupSSH(clusterName)
   396  	}),
   397  }
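
// exampleCleanupOnError sketches the named-return-plus-defer pattern that
// createCmd uses above: the deferred closure inspects retErr and performs
// cleanup only when the surrounding function is returning an error. The work
// done here is hypothetical.
func exampleCleanupOnError() (retErr error) {
	defer func() {
		if retErr != nil {
			fmt.Fprintf(os.Stderr, "cleaning up after error: %s\n", retErr)
		}
	}()
	return errors.New("provisioning failed")
}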
   398  
   399  var setupSSHCmd = &cobra.Command{
   400  	Use:   "setup-ssh <cluster>",
   401  	Short: "set up ssh for a cluster",
   402  	Long: `Sets up the keys and host keys for the vms in the cluster.
   403  
   404  It first resets the machine credentials as though the cluster were newly created
   405  using the cloud provider APIs, then ensures that the hosts can SSH into each
   406  other, and lastly adds additional public keys to AWS hosts as read from the
   407  GCP project. This operation is performed as the last step of creating a new
   408  cluster but can be useful to re-run if the operation failed previously or if
   409  the user would like to update the keys on the remote hosts.
   410  `,
   411  
   412  	Args: cobra.ExactArgs(1),
   413  	Run: wrap(func(cmd *cobra.Command, args []string) (retErr error) {
   414  		clusterName, err := verifyClusterName(args[0])
   415  		if err != nil {
   416  			return err
   417  		}
   418  		return setupSSH(clusterName)
   419  	}),
   420  }
   421  
   422  func setupSSH(clusterName string) error {
   423  	cloud, err := syncCloud(quiet)
   424  	if err != nil {
   425  		return err
   426  	}
   427  	cloudCluster, ok := cloud.Clusters[clusterName]
   428  	if !ok {
   429  		return fmt.Errorf("could not find %s in list of clusters", clusterName)
   430  	}
   431  	cloudCluster.PrintDetails()
   432  	// Run ssh-keygen -R serially on each new VM in case an IP address has been recycled.
   433  	for _, v := range cloudCluster.VMs {
   434  		cmd := exec.Command("ssh-keygen", "-R", v.PublicIP)
   435  		out, err := cmd.CombinedOutput()
   436  		if err != nil {
   437  			log.Printf("could not clear ssh key for hostname %s:\n%s", v.PublicIP, string(out))
   438  		}
   439  
   440  	}
   441  
   442  	// Wait for the nodes in the cluster to start.
   443  	install.Clusters = map[string]*install.SyncedCluster{}
   444  	if err := loadClusters(); err != nil {
   445  		return err
   446  	}
   447  	installCluster, err := newCluster(clusterName)
   448  	if err != nil {
   449  		return err
   450  	}
   451  	// For GCP clusters we need to use the config.OSUser even if the client
   452  	// requested the shared user.
   453  	for i := range installCluster.VMs {
   454  		if cloudCluster.VMs[i].Provider == gce.ProviderName {
   455  			installCluster.Users[i] = config.OSUser.Username
   456  		}
   457  	}
   458  	if err := installCluster.Wait(); err != nil {
   459  		return err
   460  	}
   461  	// Fetch public keys from gcloud to set up ssh access for all users into the
   462  	// shared ubuntu user.
   463  	installCluster.AuthorizedKeys, err = gce.GetUserAuthorizedKeys()
   464  	if err != nil {
   465  		return errors.Wrap(err, "failed to retrieve authorized keys from gcloud")
   466  	}
   467  	return installCluster.SetupSSH()
   468  }
   469  
   470  func cleanupFailedCreate(clusterName string) error {
   471  	cloud, err := cld.ListCloud()
   472  	if err != nil {
   473  		return err
   474  	}
   475  	c, ok := cloud.Clusters[clusterName]
   476  	if !ok {
   477  		// If the cluster doesn't exist, we didn't manage to create any VMs
   478  		// before failing. Not an error.
   479  		return nil
   480  	}
   481  	return cld.DestroyCluster(c)
   482  }
   483  
   484  var destroyCmd = &cobra.Command{
   485  	Use:   "destroy [ --all-mine | <cluster 1> [<cluster 2> ...] ]",
   486  	Short: "destroy clusters",
   487  	Long: `Destroy one or more local or cloud-based clusters.
   488  
   489  The destroy command accepts the names of the clusters to destroy. Alternatively,
   490  the --all-mine flag can be provided to destroy all clusters that are owned by the
   491  current user.
   492  
   493  Destroying a cluster releases the resources for a cluster. For a cloud-based
   494  cluster the machine and associated disk resources are freed. For a local
   495  cluster, any processes started by roachprod are stopped, and the ${HOME}/local
   496  directory is removed.
   497  `,
   498  	Args: cobra.ArbitraryArgs,
   499  	Run: wrap(func(cmd *cobra.Command, args []string) error {
   500  		switch len(args) {
   501  		case 0:
   502  			if !destroyAllMine {
   503  				return errors.New("no cluster name provided")
   504  			}
   505  
   506  			destroyPattern, err := userClusterNameRegexp()
   507  			if err != nil {
   508  				return err
   509  			}
   510  
   511  			cloud, err := cld.ListCloud()
   512  			if err != nil {
   513  				return err
   514  			}
   515  
   516  			var names []string
   517  			for name := range cloud.Clusters {
   518  				if destroyPattern.MatchString(name) {
   519  					names = append(names, name)
   520  				}
   521  			}
   522  			sort.Strings(names)
   523  
   524  			for _, clusterName := range names {
   525  				if err := destroyCluster(cloud, clusterName); err != nil {
   526  					return err
   527  				}
   528  			}
   529  		default:
   530  			if destroyAllMine {
   531  				return errors.New("--all-mine cannot be combined with cluster names")
   532  			}
   533  
   534  			var cloud *cld.Cloud
   535  			for _, arg := range args {
   536  				clusterName, err := verifyClusterName(arg)
   537  				if err != nil {
   538  					return err
   539  				}
   540  
   541  				if clusterName != config.Local {
   542  					if cloud == nil {
   543  						cloud, err = cld.ListCloud()
   544  						if err != nil {
   545  							return err
   546  						}
   547  					}
   548  
   549  					if err := destroyCluster(cloud, clusterName); err != nil {
   550  						return err
   551  					}
   552  				} else {
   553  					if err := destroyLocalCluster(); err != nil {
   554  						return err
   555  					}
   556  				}
   557  			}
   558  		}
   559  		fmt.Println("OK")
   560  		return nil
   561  	}),
   562  }
   563  
   564  func destroyCluster(cloud *cld.Cloud, clusterName string) error {
   565  	c, ok := cloud.Clusters[clusterName]
   566  	if !ok {
   567  		return fmt.Errorf("cluster %s does not exist", clusterName)
   568  	}
   569  	fmt.Printf("Destroying cluster %s with %d nodes\n", clusterName, len(c.VMs))
   570  	return cld.DestroyCluster(c)
   571  }
   572  
   573  func destroyLocalCluster() error {
   574  	if _, ok := install.Clusters[config.Local]; !ok {
   575  		return fmt.Errorf("cluster %s does not exist", config.Local)
   576  	}
   577  	c, err := newCluster(config.Local)
   578  	if err != nil {
   579  		return err
   580  	}
   581  	c.Wipe(false)
   582  	for _, i := range c.Nodes {
   583  		err := os.RemoveAll(fmt.Sprintf(os.ExpandEnv("${HOME}/local/%d"), i))
   584  		if err != nil {
   585  			return err
   586  		}
   587  	}
   588  	return os.Remove(filepath.Join(os.ExpandEnv(config.DefaultHostDir), c.Name))
   589  }
   590  
   591  var cachedHostsCmd = &cobra.Command{
   592  	Use:   "cached-hosts",
   593  	Short: "list all clusters (and optionally their host numbers) from local cache",
   594  	Args:  cobra.NoArgs,
   595  	Run: wrap(func(cmd *cobra.Command, args []string) error {
   596  		if err := loadClusters(); err != nil {
   597  			return err
   598  		}
   599  
   600  		names := make([]string, 0, len(install.Clusters))
   601  		for name := range install.Clusters {
   602  			names = append(names, name)
   603  		}
   604  		sort.Strings(names)
   605  
   606  		for _, name := range names {
   607  			c := install.Clusters[name]
   608  			if strings.HasPrefix(c.Name, "teamcity") {
   609  				continue
   610  			}
   611  			fmt.Print(c.Name)
   612  			// When invoked by bash-completion, cachedHostsCluster is what the user
   613  			// has currently typed -- if this cluster matches that, expand its hosts.
   614  			if strings.HasPrefix(cachedHostsCluster, c.Name) {
   615  				for i := range c.VMs {
   616  					fmt.Printf(" %s:%d", c.Name, i+1)
   617  				}
   618  			}
   619  			fmt.Println()
   620  		}
   621  		return nil
   622  	}),
   623  }
   624  
   625  var listCmd = &cobra.Command{
   626  	Use:   "list [--details] [ --mine | <cluster name regex> ]",
   627  	Short: "list all clusters",
   628  	Long: `List all clusters.
   629  
   630  The list command accepts an optional positional argument, which is a regular
   631  expression that will be matched against the cluster name pattern.  Alternatively,
   632  the --mine flag can be provided to list the clusters that are owned by the current
   633  user.
   634  
   635  The default output shows one line per cluster, including the local cluster if
   636  it exists:
   637  
   638    ~ roachprod list
   639    local:     [local]    1  (-)
   640    marc-test: [aws gce]  4  (5h34m35s)
   641    Syncing...
   642  
   643  The second column lists the cloud providers that host VMs for the cluster.
   644  
   645  The third and fourth columns are the number of nodes in the cluster and the
   646  time remaining before the cluster will be automatically destroyed. Note that
   647  local clusters do not have an expiration.
   648  
   649  The --details flag adjusts the output format to include per-node details:
   650  
   651    ~ roachprod list --details
   652    local [local]: (no expiration)
   653      localhost		127.0.0.1	127.0.0.1
   654    marc-test: [aws gce] 5h33m57s remaining
   655      marc-test-0001	marc-test-0001.us-east1-b.cockroach-ephemeral	10.142.0.18	35.229.60.91
   656      marc-test-0002	marc-test-0002.us-east1-b.cockroach-ephemeral	10.142.0.17	35.231.0.44
   657      marc-test-0003	marc-test-0003.us-east1-b.cockroach-ephemeral	10.142.0.19	35.229.111.100
   658      marc-test-0004	marc-test-0004.us-east1-b.cockroach-ephemeral	10.142.0.20	35.231.102.125
   659    Syncing...
   660  
   661  The first and second columns are the node hostname and fully qualified name
   662  respectively. The third and fourth columns are the private and public IP
   663  addresses.
   664  
   665  The --json flag sets the format of the command output to json.
   666  
   667  Listing clusters has the side-effect of syncing ssh keys/configs and the local
   668  hosts file.
   669  `,
   670  	Args: cobra.RangeArgs(0, 1),
   671  	Run: wrap(func(cmd *cobra.Command, args []string) error {
   672  		listPattern := regexp.MustCompile(".*")
   673  		switch len(args) {
   674  		case 0:
   675  			if listMine {
   676  				var err error
   677  				listPattern, err = userClusterNameRegexp()
   678  				if err != nil {
   679  					return err
   680  				}
   681  			}
   682  		case 1:
   683  			if listMine {
   684  				return errors.New("--mine cannot be combined with a pattern")
   685  			}
   686  			var err error
   687  			listPattern, err = regexp.Compile(args[0])
   688  			if err != nil {
   689  				return errors.Wrapf(err, "could not compile regex pattern: %s", args[0])
   690  			}
   691  		default:
   692  			return errors.New("only a single pattern may be listed")
   693  		}
   694  
   695  		cloud, err := syncCloud(quiet)
   696  		if err != nil {
   697  			return err
   698  		}
   699  
   700  		// Filter and sort by cluster names for stable output.
   701  		var names []string
   702  		filteredCloud := cloud.Clone()
   703  		for name := range cloud.Clusters {
   704  			if listPattern.MatchString(name) {
   705  				names = append(names, name)
   706  			} else {
   707  				delete(filteredCloud.Clusters, name)
   708  			}
   709  		}
   710  		sort.Strings(names)
   711  
   712  		if listJSON {
   713  			if listDetails {
   714  				return errors.New("--json cannot be combined with --details")
   715  			}
   716  
   717  			enc := json.NewEncoder(os.Stdout)
   718  			enc.SetIndent("", "  ")
   719  			if err := enc.Encode(filteredCloud); err != nil {
   720  				return err
   721  			}
   722  		} else {
   723  			// Align columns left and separate with at least two spaces.
   724  			tw := tabwriter.NewWriter(os.Stdout, 0, 8, 2, ' ', 0)
   725  			for _, name := range names {
   726  				c := filteredCloud.Clusters[name]
   727  				if listDetails {
   728  					c.PrintDetails()
   729  				} else {
   730  					fmt.Fprintf(tw, "%s:\t%s\t%d", c.Name, c.Clouds(), len(c.VMs))
   731  					if !c.IsLocal() {
   732  						fmt.Fprintf(tw, "\t(%s)", c.LifetimeRemaining().Round(time.Second))
   733  					} else {
   734  						fmt.Fprintf(tw, "\t(-)")
   735  					}
   736  					fmt.Fprintf(tw, "\n")
   737  				}
   738  			}
   739  			if err := tw.Flush(); err != nil {
   740  				return err
   741  			}
   742  
   743  			// Optionally print any dangling instances with errors
   744  			if listDetails {
   745  				collated := filteredCloud.BadInstanceErrors()
   746  
   747  				// Sort by Error() value for stable output
   748  				var errors ui.ErrorsByError
   749  				for err := range collated {
   750  					errors = append(errors, err)
   751  				}
   752  				sort.Sort(errors)
   753  
   754  				for _, e := range errors {
   755  					fmt.Printf("%s: %s\n", e, collated[e].Names())
   756  				}
   757  			}
   758  		}
   759  		return nil
   760  	}),
   761  }
   762  
   763  // userClusterNameRegexp returns a regexp that matches all clusters owned by the
   764  // current user.
   765  func userClusterNameRegexp() (*regexp.Regexp, error) {
   766  	// In general, we expect that users will have the same
   767  	// account name across the services they're using,
   768  	// but we still want to function even if this is not
   769  	// the case.
   770  	seenAccounts := map[string]bool{}
   771  	accounts, err := vm.FindActiveAccounts()
   772  	if err != nil {
   773  		return nil, err
   774  	}
   775  	pattern := ""
   776  	for _, account := range accounts {
   777  		if !seenAccounts[account] {
   778  			seenAccounts[account] = true
   779  			if len(pattern) > 0 {
   780  				pattern += "|"
   781  			}
   782  			pattern += fmt.Sprintf("(^%s-)", regexp.QuoteMeta(account))
   783  		}
   784  	}
   785  	return regexp.Compile(pattern)
   786  }
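
// exampleUserPattern shows, with hypothetical accounts "marc" and "peter",
// the shape of the regexp built by userClusterNameRegexp above.
func exampleUserPattern() {
	p := regexp.MustCompile(`(^marc-)|(^peter-)`)
	fmt.Println(p.MatchString("marc-test")) // true
	fmt.Println(p.MatchString("joe-test"))  // false
}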
   787  
   788  // TODO(peter): Do we need this command given that the "list" command syncs as
    789  // a side-effect? If you don't care about the list output, just "roachprod list
   790  // &>/dev/null".
   791  var syncCmd = &cobra.Command{
   792  	Use:   "sync",
   793  	Short: "sync ssh keys/config and hosts files",
   794  	Long:  ``,
   795  	Args:  cobra.NoArgs,
   796  	Run: wrap(func(cmd *cobra.Command, args []string) error {
   797  		_, err := syncCloud(quiet)
   798  		return err
   799  	}),
   800  }
   801  
   802  var lockFile = os.ExpandEnv("$HOME/.roachprod/LOCK")
   803  
   804  var bashCompletion = os.ExpandEnv("$HOME/.roachprod/bash-completion.sh")
   805  
   806  // syncCloud grabs an exclusive lock on the roachprod state and then proceeds to
   807  // read the current state from the cloud and write it out to disk. The locking
   808  // protects both the reading and the writing in order to prevent the hazard
   809  // caused by concurrent goroutines reading cloud state in a different order
   810  // than writing it to disk.
   811  func syncCloud(quiet bool) (*cld.Cloud, error) {
   812  	if !quiet {
   813  		fmt.Println("Syncing...")
   814  	}
   815  	// Acquire a filesystem lock so that two concurrent synchronizations of
   816  	// roachprod state don't clobber each other.
   817  	f, err := os.Create(lockFile)
   818  	if err != nil {
   819  		return nil, errors.Wrapf(err, "creating lock file %q", lockFile)
   820  	}
   821  	if err := unix.Flock(int(f.Fd()), unix.LOCK_EX); err != nil {
    822  		return nil, errors.Wrapf(err, "acquiring lock on %q", lockFile)
   823  	}
   824  	defer f.Close()
   825  	cloud, err := cld.ListCloud()
   826  	if err != nil {
   827  		return nil, err
   828  	}
   829  	if err := syncHosts(cloud); err != nil {
   830  		return nil, err
   831  	}
   832  
   833  	var vms vm.List
   834  	for _, c := range cloud.Clusters {
   835  		vms = append(vms, c.VMs...)
   836  	}
   837  
    838  	// Figure out if we're going to overwrite the DNS entries. We don't want to
    839  	// overwrite if we don't have all the VMs of interest, so we only do it if we
    840  	// have a list of all VMs from both AWS and GCE; that is, if both providers
    841  	// are active and, for GCE, if the default project is among the projects
    842  	// being listed.
   843  	refreshDNS := true
   844  
   845  	if p := vm.Providers[gce.ProviderName]; !p.Active() {
   846  		refreshDNS = false
   847  	} else {
   848  		var defaultProjectFound bool
   849  		for _, prj := range p.(*gce.Provider).GetProjects() {
   850  			if prj == gce.DefaultProject() {
   851  				defaultProjectFound = true
   852  				break
   853  			}
   854  		}
   855  		if !defaultProjectFound {
   856  			refreshDNS = false
   857  		}
   858  	}
   859  	if !vm.Providers[aws.ProviderName].Active() {
   860  		refreshDNS = false
   861  	}
   862  	// DNS entries are maintained in the GCE DNS registry for all vms, from all
   863  	// clouds.
   864  	if refreshDNS {
   865  		if !quiet {
   866  			fmt.Println("Refreshing DNS entries...")
   867  		}
   868  		if err := gce.SyncDNS(vms); err != nil {
   869  			fmt.Fprintf(os.Stderr, "failed to update %s DNS: %v", gce.Subdomain, err)
   870  		}
   871  	} else {
   872  		if !quiet {
   873  			fmt.Println("Not refreshing DNS entries. We did not have all the VMs.")
   874  		}
   875  	}
   876  
   877  	if err := vm.ProvidersSequential(vm.AllProviderNames(), func(p vm.Provider) error {
   878  		return p.CleanSSH()
   879  	}); err != nil {
   880  		return nil, err
   881  	}
   882  
   883  	_ = rootCmd.GenBashCompletionFile(bashCompletion)
   884  
   885  	if err := vm.ProvidersSequential(vm.AllProviderNames(), func(p vm.Provider) error {
   886  		return p.ConfigSSH()
   887  	}); err != nil {
   888  		return nil, err
   889  	}
   890  	return cloud, nil
   891  }
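
// exampleExclusiveLock is a minimal sketch of the flock pattern syncCloud uses
// above: create (or truncate) the lock file, then block until the exclusive
// lock is granted. The path here is hypothetical; the lock is released when
// the file descriptor is closed.
func exampleExclusiveLock() (unlock func(), err error) {
	f, err := os.Create(os.ExpandEnv("$HOME/.roachprod/EXAMPLE_LOCK"))
	if err != nil {
		return nil, err
	}
	if err := unix.Flock(int(f.Fd()), unix.LOCK_EX); err != nil {
		f.Close()
		return nil, err
	}
	return func() { f.Close() }, nil
}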
   892  
   893  var gcCmd = &cobra.Command{
   894  	Use:   "gc",
   895  	Short: "GC expired clusters\n",
   896  	Long: `Garbage collect expired clusters.
   897  
   898  Destroys expired clusters, sending email if properly configured. Usually run
   899  hourly by a cronjob so it is not necessary to run manually.
   900  `,
   901  	Args: cobra.NoArgs,
   902  	Run: wrap(func(cmd *cobra.Command, args []string) error {
   903  		cloud, err := cld.ListCloud()
   904  		if err != nil {
   905  			return err
   906  		}
   907  		return cld.GCClusters(cloud, dryrun)
   908  	}),
   909  }
   910  
   911  var extendCmd = &cobra.Command{
   912  	Use:   "extend <cluster>",
   913  	Short: "extend the lifetime of a cluster",
   914  	Long: `Extend the lifetime of the specified cluster to prevent it from being
   915  destroyed:
   916  
   917    roachprod extend marc-test --lifetime=6h
   918  `,
   919  	Args: cobra.ExactArgs(1),
   920  	Run: wrap(func(cmd *cobra.Command, args []string) error {
   921  		clusterName, err := verifyClusterName(args[0])
   922  		if err != nil {
   923  			return err
   924  		}
   925  
   926  		cloud, err := cld.ListCloud()
   927  		if err != nil {
   928  			return err
   929  		}
   930  
   931  		c, ok := cloud.Clusters[clusterName]
   932  		if !ok {
   933  			return fmt.Errorf("cluster %s does not exist", clusterName)
   934  		}
   935  
   936  		if err := cld.ExtendCluster(c, extendLifetime); err != nil {
   937  			return err
   938  		}
   939  
   940  		// Reload the clusters and print details.
   941  		cloud, err = cld.ListCloud()
   942  		if err != nil {
   943  			return err
   944  		}
   945  
   946  		c, ok = cloud.Clusters[clusterName]
   947  		if !ok {
   948  			return fmt.Errorf("cluster %s does not exist", clusterName)
   949  		}
   950  
   951  		c.PrintDetails()
   952  		return nil
   953  	}),
   954  }
   955  
   956  const tagHelp = `
   957  The --tag flag can be used to associate a tag with the process. This tag can
   958  then be used to restrict the processes which are operated on by the status and
   959  stop commands. Tags can have a hierarchical component, expressed as a
   960  slash-separated string similar to a filesystem path. A tag matches if a prefix
   961  of its components match. For example, the tag "a/b" will match both "a/b" and
   962  "a/b/c/d".
   963  `
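
// tagMatches sketches the prefix-component matching rule described in tagHelp.
// It is an illustration of the rule, not the matcher roachprod itself uses:
// the filter "a/b" matches the process tags "a/b" and "a/b/c/d", but not "a/bc".
func tagMatches(filter, tag string) bool {
	f := strings.Split(filter, "/")
	t := strings.Split(tag, "/")
	if len(t) < len(f) {
		return false
	}
	for i := range f {
		if f[i] != t[i] {
			return false
		}
	}
	return true
}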
   964  
   965  var startCmd = &cobra.Command{
   966  	Use:   "start <cluster>",
   967  	Short: "start nodes on a cluster",
   968  	Long: `Start nodes on a cluster.
   969  
   970  The --secure flag can be used to start nodes in secure mode (i.e. using
   971  certs). When specified, there is a one time initialization for the cluster to
   972  create and distribute the certs. Note that running some nodes in secure mode
   973  and others in insecure mode is not a supported Cockroach configuration.
   974  
   975  As a debugging aid, the --sequential flag starts the nodes sequentially so node
   976  IDs match hostnames. Otherwise nodes are started in parallel.
   977  
   978  The --binary flag specifies the remote binary to run. It is up to the roachprod
   979  user to ensure this binary exists, usually via "roachprod put". Note that no
   980  cockroach software is installed by default on a newly created cluster.
   981  
   982  The --args and --env flags can be used to pass arbitrary command line flags and
   983  environment variables to the cockroach process.
   984  ` + tagHelp + `
   985  The "start" command takes care of setting up the --join address and specifying
   986  reasonable defaults for other flags. One side-effect of this convenience is
   987  that node 1 is special and must be started for the cluster to be initialized.
   988  
   989  If the COCKROACH_DEV_LICENSE environment variable is set the enterprise.license
   990  cluster setting will be set to its value.
   991  `,
   992  	Args: cobra.ExactArgs(1),
   993  	Run: wrap(func(cmd *cobra.Command, args []string) error {
   994  		c, err := newCluster(args[0])
   995  		if err != nil {
   996  			return err
   997  		}
   998  		c.Start()
   999  		return nil
  1000  	}),
  1001  }
  1002  
  1003  var stopCmd = &cobra.Command{
  1004  	Use:   "stop <cluster> [--sig] [--wait]",
  1005  	Short: "stop nodes on a cluster",
  1006  	Long: `Stop nodes on a cluster.
  1007  
  1008  Stop roachprod created processes running on the nodes in a cluster, including
  1009  processes started by the "start", "run" and "ssh" commands. Every process
  1010  started by roachprod is tagged with a ROACHPROD=<node> environment variable
  1011  which is used by "stop" to locate the processes and terminate them. By default
  1012  processes are killed with signal 9 (SIGKILL), giving them no chance for a graceful
  1013  exit.
  1014  
  1015  The --sig flag passes a different signal to kill, allowing finer control over
  1016  how cockroach is shut down. The --wait flag causes stop to loop waiting for all
  1017  processes with the ROACHPROD=<node> environment variable to exit. Note that
  1018  stop will wait forever if you specify --wait with a non-terminating signal
  1019  (e.g. SIGHUP). --wait defaults to true for signal 9 (SIGKILL) and false for all
  1020  other signals.
  1021  ` + tagHelp + `
  1022  `,
  1023  	Args: cobra.ExactArgs(1),
  1024  	Run: wrap(func(cmd *cobra.Command, args []string) error {
  1025  		c, err := newCluster(args[0])
  1026  		if err != nil {
  1027  			return err
  1028  		}
  1029  		wait := waitFlag
  1030  		if sig == 9 /* SIGKILL */ && !cmd.Flags().Changed("wait") {
  1031  			wait = true
  1032  		}
  1033  		c.Stop(sig, wait)
  1034  		return nil
  1035  	}),
  1036  }
  1037  
  1038  var statusCmd = &cobra.Command{
  1039  	Use:   "status <cluster>",
  1040  	Short: "retrieve the status of nodes in a cluster",
  1041  	Long: `Retrieve the status of nodes in a cluster.
  1042  
  1043  The "status" command outputs the binary and PID for the specified nodes:
  1044  
  1045    ~ roachprod status local
  1046    local: status 3/3
  1047       1: cockroach 29688
  1048       2: cockroach 29687
  1049       3: cockroach 29689
  1050  ` + tagHelp + `
  1051  `,
  1052  	Args: cobra.ExactArgs(1),
  1053  	Run: wrap(func(cmd *cobra.Command, args []string) error {
  1054  		c, err := newCluster(args[0])
  1055  		if err != nil {
  1056  			return err
  1057  		}
  1058  		c.Status()
  1059  		return nil
  1060  	}),
  1061  }
  1062  
  1063  var logsCmd = &cobra.Command{
  1064  	Use:   "logs",
  1065  	Short: "retrieve and merge logs in a cluster",
  1066  	Long: `Retrieve and merge logs in a cluster.
  1067  
  1068  The "logs" command runs until terminated. It works similarly to get but is
  1069  specifically focused on retrieving logs periodically and then merging them
  1070  into a single stream.
  1071  `,
  1072  	Args: cobra.RangeArgs(1, 2),
  1073  	Run: wrap(func(cmd *cobra.Command, args []string) error {
  1074  		c, err := newCluster(args[0])
  1075  		if err != nil {
  1076  			return err
  1077  		}
  1078  		var dest string
  1079  		if len(args) == 2 {
  1080  			dest = args[1]
  1081  		} else {
  1082  			dest = c.Name + ".logs"
  1083  		}
  1084  		return c.Logs(logsDir, dest, username, logsFilter, logsProgramFilter, logsInterval, logsFrom, logsTo, cmd.OutOrStdout())
  1085  	}),
  1086  }
  1087  
  1088  var monitorCmd = &cobra.Command{
  1089  	Use:   "monitor",
  1090  	Short: "monitor the status of nodes in a cluster",
  1091  	Long: `Monitor the status of cockroach nodes in a cluster.
  1092  
  1093  The "monitor" command runs until terminated. At startup it outputs a line for
  1094  each specified node indicating the status of the node (either the PID of the
  1095  node if alive, or "dead" otherwise). It then watches for changes in the status
  1096  of nodes, outputting a line whenever a change is detected:
  1097  
  1098    ~ roachprod monitor local
  1099    1: 29688
  1100    3: 29689
  1101    2: 29687
  1102    3: dead
  1103    3: 30718
  1104  `,
  1105  	Args: cobra.ExactArgs(1),
  1106  	Run: wrap(func(cmd *cobra.Command, args []string) error {
  1107  		c, err := newCluster(args[0])
  1108  		if err != nil {
  1109  			return err
  1110  		}
  1111  		for msg := range c.Monitor(monitorIgnoreEmptyNodes, monitorOneShot) {
  1112  			if msg.Err != nil {
  1113  				msg.Msg += "error: " + msg.Err.Error()
  1114  			}
  1115  			thisError := errors.Newf("%d: %s", msg.Index, msg.Msg)
  1116  			if msg.Err != nil || strings.Contains(msg.Msg, "dead") {
  1117  				err = errors.CombineErrors(err, thisError)
  1118  			}
  1119  			fmt.Println(thisError.Error())
  1120  		}
  1121  		return err
  1122  	}),
  1123  }
  1124  
  1125  var wipeCmd = &cobra.Command{
  1126  	Use:   "wipe <cluster>",
  1127  	Short: "wipe a cluster",
  1128  	Long: `Wipe the nodes in a cluster.
  1129  
  1130  The "wipe" command first stops any processes running on the nodes in a cluster
  1131  (via the "stop" command) and then deletes the data directories used by the
  1132  nodes.
  1133  `,
  1134  	Args: cobra.ExactArgs(1),
  1135  	Run: wrap(func(cmd *cobra.Command, args []string) error {
  1136  		c, err := newCluster(args[0])
  1137  		if err != nil {
  1138  			return err
  1139  		}
  1140  		c.Wipe(wipePreserveCerts)
  1141  		return nil
  1142  	}),
  1143  }
  1144  
  1145  var reformatCmd = &cobra.Command{
  1146  	Use:   "reformat <cluster> <filesystem>",
  1147  	Short: "reformat disks in a cluster\n",
  1148  	Long: `
  1149  Reformat disks in a cluster to use the specified filesystem.
  1150  
  1151  WARNING: Reformatting will delete all existing data in the cluster.
  1152  
  1153  Filesystem options:
  1154    ext4
  1155    zfs
  1156  
  1157  When running with ZFS, you can create a snapshot of the filesystem's current
  1158  state using the 'zfs snapshot' command:
  1159  
  1160    $ roachprod run <cluster> 'sudo zfs snapshot data1@pristine'
  1161  
  1162  You can then nearly instantaneously restore the filesystem to this state with
  1163  the 'zfs rollback' command:
  1164  
  1165    $ roachprod run <cluster> 'sudo zfs rollback data1@pristine'
  1166  
  1167  `,
  1168  
  1169  	Args: cobra.ExactArgs(2),
  1170  	Run: wrap(func(cmd *cobra.Command, args []string) error {
  1171  		c, err := newCluster(args[0])
  1172  		if err != nil {
  1173  			return err
  1174  		}
  1175  
  1176  		var fsCmd string
  1177  		switch fs := args[1]; fs {
  1178  		case "zfs":
  1179  			if err := install.Install(c, []string{"zfs"}); err != nil {
  1180  				return err
  1181  			}
  1182  			fsCmd = `sudo zpool create -f data1 -m /mnt/data1 /dev/sdb`
  1183  		case "ext4":
  1184  			fsCmd = `sudo mkfs.ext4 -F /dev/sdb && sudo mount -o discard,defaults /dev/sdb /mnt/data1`
  1185  		default:
  1186  			return fmt.Errorf("unknown filesystem %q", fs)
  1187  		}
  1188  
  1189  		err = c.Run(os.Stdout, os.Stderr, c.Nodes, install.OtherCmd, "reformatting", fmt.Sprintf(`
  1190  set -euo pipefail
  1191  if sudo zpool list -Ho name 2>/dev/null | grep ^data1$; then
  1192    sudo zpool destroy -f data1
  1193  fi
  1194  if mountpoint -q /mnt/data1; then
  1195    sudo umount -f /mnt/data1
  1196  fi
  1197  %s
  1198  sudo chmod 777 /mnt/data1
  1199  `, fsCmd))
  1200  		if err != nil {
  1201  			fmt.Fprintf(os.Stderr, "%s\n", err)
  1202  		}
  1203  		return nil
  1204  	}),
  1205  }
  1206  
  1207  var runCmd = &cobra.Command{
  1208  	Use:     "run <cluster> <command> [args]",
  1209  	Aliases: []string{"ssh"},
  1210  	Short:   "run a command on the nodes in a cluster",
  1211  	Long: `Run a command on the nodes in a cluster.
  1212  `,
  1213  	Args: cobra.MinimumNArgs(1),
  1214  	Run: wrap(func(_ *cobra.Command, args []string) error {
  1215  		c, err := newCluster(args[0])
  1216  		if err != nil {
  1217  			return err
  1218  		}
  1219  
  1220  		// Use "ssh" if an interactive session was requested (i.e. there is no
  1221  		// remote command to run).
  1222  		if len(args) == 1 {
  1223  			return c.SSH(nil, args[1:])
  1224  		}
  1225  
  1226  		cmd := strings.TrimSpace(strings.Join(args[1:], " "))
  1227  		title := cmd
  1228  		if len(title) > 30 {
  1229  			title = title[:27] + "..."
  1230  		}
  1231  		return c.Run(os.Stdout, os.Stderr, c.Nodes, install.CockroachCmd, title, cmd)
  1232  	}),
  1233  }
  1234  
  1235  var installCmd = &cobra.Command{
  1236  	Use:   "install <cluster> <software>",
  1237  	Short: "install 3rd party software",
  1238  	Long: `Install third party software. Currently available installation options are:
  1239  
  1240      ` + strings.Join(install.SortedCmds(), "\n    ") + `
  1241  `,
  1242  	Args: cobra.MinimumNArgs(2),
  1243  	Run: wrap(func(cmd *cobra.Command, args []string) error {
  1244  		c, err := newCluster(args[0])
  1245  		if err != nil {
  1246  			return err
  1247  		}
  1248  		return install.Install(c, args[1:])
  1249  	}),
  1250  }
  1251  
  1252  var stageCmd = &cobra.Command{
  1253  	Use:   "stage <cluster> <application> [<sha/version>]",
  1254  	Short: "stage cockroach binaries",
  1255  	Long: `Stages release and edge binaries to the cluster.
  1256  
  1257  Currently available application options are:
  1258    cockroach - Cockroach Unofficial. Can provide an optional SHA, otherwise
  1259                latest build version is used.
  1260    workload  - Cockroach workload application.
  1261    release   - Official CockroachDB Release. Must provide a specific release
  1262                version.
  1263  
  1264  Some examples of usage:
  1265    -- Stage the edge build of cockroach at a specific SHA:
  1266    roachprod stage my-cluster cockroach e90e6903fee7dd0f88e20e345c2ddfe1af1e5a97
  1267  
  1268    -- Stage the most recent edge build of the workload tool:
  1269    roachprod stage my-cluster workload
  1270  
  1271    -- Stage the official release binary of CockroachDB at version 2.0.5
  1272    roachprod stage my-cluster release v2.0.5
  1273  `,
  1274  	Args: cobra.RangeArgs(2, 3),
  1275  	Run: wrap(func(cmd *cobra.Command, args []string) error {
  1276  		c, err := newCluster(args[0])
  1277  		if err != nil {
  1278  			return err
  1279  		}
  1280  
  1281  		os := "linux"
  1282  		if stageOS != "" {
  1283  			os = stageOS
  1284  		} else if c.IsLocal() {
  1285  			os = runtime.GOOS
  1286  		}
  1287  		var debugArch, releaseArch string
  1288  		switch os {
  1289  		case "linux":
  1290  			debugArch, releaseArch = "linux-gnu-amd64", "linux-amd64"
  1291  		case "darwin":
  1292  			debugArch, releaseArch = "darwin-amd64", "darwin-10.9-amd64"
  1293  		case "windows":
  1294  			debugArch, releaseArch = "windows-amd64", "windows-6.2-amd64"
  1295  		default:
  1296  			return errors.Errorf("cannot stage binary on %s", os)
  1297  		}
  1298  
  1299  		applicationName := args[1]
  1300  		versionArg := ""
  1301  		if len(args) == 3 {
  1302  			versionArg = args[2]
  1303  		}
  1304  		switch applicationName {
  1305  		case "cockroach":
  1306  			return install.StageRemoteBinary(
  1307  				c, applicationName, "cockroach/cockroach", versionArg, debugArch,
  1308  			)
  1309  		case "workload":
  1310  			return install.StageRemoteBinary(
  1311  				c, applicationName, "cockroach/workload", versionArg, "", /* arch */
  1312  			)
  1313  		case "release":
  1314  			return install.StageCockroachRelease(c, versionArg, releaseArch)
  1315  		default:
  1316  			return fmt.Errorf("unknown application %s", applicationName)
  1317  		}
  1318  	}),
  1319  }
  1320  
  1321  var distributeCertsCmd = &cobra.Command{
  1322  	Use:   "distribute-certs <cluster>",
  1323  	Short: "distribute certificates to the nodes in a cluster",
  1324  	Long: `Distribute certificates to the nodes in a cluster.
  1325  If the certificates already exist, no action is taken. Note that this command is
  1326  invoked automatically when a secure cluster is bootstrapped by "roachprod
  1327  start."
  1328  `,
  1329  	Args: cobra.ExactArgs(1),
  1330  	Run: wrap(func(cmd *cobra.Command, args []string) error {
  1331  		c, err := newCluster(args[0])
  1332  		if err != nil {
  1333  			return err
  1334  		}
  1335  		c.DistributeCerts()
  1336  		return nil
  1337  	}),
  1338  }
  1339  
  1340  var putCmd = &cobra.Command{
  1341  	Use:   "put <cluster> <src> [<dest>]",
  1342  	Short: "copy a local file to the nodes in a cluster",
  1343  	Long: `Copy a local file to the nodes in a cluster.
  1344  `,
  1345  	Args: cobra.RangeArgs(2, 3),
  1346  	Run: wrap(func(cmd *cobra.Command, args []string) error {
  1347  		src := args[1]
  1348  		dest := path.Base(src)
  1349  		if len(args) == 3 {
  1350  			dest = args[2]
  1351  		}
  1352  		c, err := newCluster(args[0])
  1353  		if err != nil {
  1354  			return err
  1355  		}
  1356  		c.Put(src, dest)
  1357  		return nil
  1358  	}),
  1359  }
  1360  
  1361  var getCmd = &cobra.Command{
  1362  	Use:   "get <cluster> <src> [<dest>]",
  1363  	Short: "copy a remote file from the nodes in a cluster",
  1364  	Long: `Copy a remote file from the nodes in a cluster. If the file is retrieved from
  1365  multiple nodes, the destination file name will be prefixed with the node number.
  1366  `,
  1367  	Args: cobra.RangeArgs(2, 3),
  1368  	Run: wrap(func(cmd *cobra.Command, args []string) error {
  1369  		src := args[1]
  1370  		dest := path.Base(src)
  1371  		if len(args) == 3 {
  1372  			dest = args[2]
  1373  		}
  1374  		c, err := newCluster(args[0])
  1375  		if err != nil {
  1376  			return err
  1377  		}
  1378  		c.Get(src, dest)
  1379  		return nil
  1380  	}),
  1381  }
  1382  
  1383  var sqlCmd = &cobra.Command{
  1384  	Use:   "sql <cluster> -- [args]",
  1385  	Short: "run `cockroach sql` on a remote cluster",
  1386  	Long:  "Run `cockroach sql` on a remote cluster.\n",
  1387  	Args:  cobra.MinimumNArgs(1),
  1388  	Run: wrap(func(cmd *cobra.Command, args []string) error {
  1389  		c, err := newCluster(args[0])
  1390  		if err != nil {
  1391  			return err
  1392  		}
  1393  		cockroach, ok := c.Impl.(install.Cockroach)
  1394  		if !ok {
  1395  			return errors.New("sql is only valid on cockroach clusters")
  1396  		}
  1397  		return cockroach.SQL(c, args[1:])
  1398  	}),
  1399  }
  1400  
  1401  var pgurlCmd = &cobra.Command{
  1402  	Use:   "pgurl <cluster>",
  1403  	Short: "generate pgurls for the nodes in a cluster",
  1404  	Long: `Generate pgurls for the nodes in a cluster.
  1405  `,
  1406  	Args: cobra.ExactArgs(1),
  1407  	Run: wrap(func(cmd *cobra.Command, args []string) error {
  1408  		c, err := newCluster(args[0])
  1409  		if err != nil {
  1410  			return err
  1411  		}
  1412  		nodes := c.ServerNodes()
  1413  		ips := make([]string, len(nodes))
  1414  
  1415  		if external {
  1416  			for i := 0; i < len(nodes); i++ {
  1417  				ips[i] = c.VMs[nodes[i]-1]
  1418  			}
  1419  		} else {
  1420  			c.Parallel("", len(nodes), 0, func(i int) ([]byte, error) {
  1421  				var err error
  1422  				ips[i], err = c.GetInternalIP(nodes[i])
  1423  				return nil, err
  1424  			})
  1425  		}
  1426  
  1427  		var urls []string
  1428  		for i, ip := range ips {
  1429  			if ip == "" {
  1430  				return errors.Errorf("empty ip: %v", ips)
  1431  			}
  1432  			urls = append(urls, c.Impl.NodeURL(c, ip, c.Impl.NodePort(c, nodes[i])))
  1433  		}
  1434  		fmt.Println(strings.Join(urls, " "))
  1435  		if len(urls) != len(nodes) {
  1436  			return errors.Errorf("have nodes %v, but urls %v from ips %v", nodes, urls, ips)
  1437  		}
  1438  		return nil
  1439  	}),
  1440  }
  1441  
  1442  var adminurlCmd = &cobra.Command{
  1443  	Use:     "adminurl <cluster>",
  1444  	Aliases: []string{"admin", "adminui"},
  1445  	Short:   "generate admin UI URLs for the nodes in a cluster\n",
  1446  	Long: `Generate admin UI URLs for the nodes in a cluster.
  1447  `,
  1448  	Args: cobra.ExactArgs(1),
  1449  	Run: wrap(func(cmd *cobra.Command, args []string) error {
  1450  		c, err := newCluster(args[0])
  1451  		if err != nil {
  1452  			return err
  1453  		}
  1454  
  1455  		for i, node := range c.ServerNodes() {
  1456  			host := vm.Name(c.Name, node) + "." + gce.Subdomain
  1457  
  1458  			// Verify DNS is working; fall back to IPs if not.
  1459  			if i == 0 && !adminurlIPs {
  1460  				if _, err := net.LookupHost(host); err != nil {
  1461  					fmt.Fprintf(os.Stderr, "no valid DNS (yet?). might need to re-run `sync`?\n")
  1462  					adminurlIPs = true
  1463  				}
  1464  			}
  1465  
  1466  			if adminurlIPs {
  1467  				host = c.VMs[node-1]
  1468  			}
  1469  			port := install.GetAdminUIPort(c.Impl.NodePort(c, node))
  1470  			scheme := "http"
  1471  			if c.Secure {
  1472  				scheme = "https"
  1473  			}
  1474  			if !strings.HasPrefix(adminurlPath, "/") {
  1475  				adminurlPath = "/" + adminurlPath
  1476  			}
  1477  			url := fmt.Sprintf("%s://%s:%d%s", scheme, host, port, adminurlPath)
  1478  			if adminurlOpen {
  1479  				if err := exec.Command("python", "-m", "webbrowser", url).Run(); err != nil {
  1480  					return err
  1481  				}
  1482  			} else {
  1483  				fmt.Println(url)
  1484  			}
  1485  		}
  1486  		return nil
  1487  	}),
  1488  }
  1489  
  1490  var ipCmd = &cobra.Command{
  1491  	Use:   "ip <cluster>",
  1492  	Short: "get the IP addresses of the nodes in a cluster",
  1493  	Long: `Get the IP addresses of the nodes in a cluster.
  1494  `,
  1495  	Args: cobra.ExactArgs(1),
  1496  	Run: wrap(func(cmd *cobra.Command, args []string) error {
  1497  		c, err := newCluster(args[0])
  1498  		if err != nil {
  1499  			return err
  1500  		}
  1501  
  1502  		nodes := c.ServerNodes()
  1503  		ips := make([]string, len(nodes))
  1504  
  1505  		if external {
  1506  			for i := 0; i < len(nodes); i++ {
  1507  				ips[i] = c.VMs[nodes[i]-1]
  1508  			}
  1509  		} else {
  1510  			c.Parallel("", len(nodes), 0, func(i int) ([]byte, error) {
  1511  				var err error
  1512  				ips[i], err = c.GetInternalIP(nodes[i])
  1513  				return nil, err
  1514  			})
  1515  		}
  1516  
  1517  		for _, ip := range ips {
  1518  			fmt.Println(ip)
  1519  		}
  1520  		return nil
  1521  	}),
  1522  }
  1523  
  1524  func main() {
  1525  	// The commands are displayed in the order they are added to rootCmd. Note
  1526  	// that some commands (e.g. gcCmd and adminurlCmd) have a trailing \n in their
  1527  	// Short help in order to separate the commands into logical groups.
  1528  	cobra.EnableCommandSorting = false
  1529  	rootCmd.AddCommand(
  1530  		createCmd,
  1531  		destroyCmd,
  1532  		extendCmd,
  1533  		listCmd,
  1534  		syncCmd,
  1535  		gcCmd,
  1536  		setupSSHCmd,
  1537  
  1538  		statusCmd,
  1539  		monitorCmd,
  1540  		startCmd,
  1541  		stopCmd,
  1542  		runCmd,
  1543  		wipeCmd,
  1544  		reformatCmd,
  1545  		installCmd,
  1546  		distributeCertsCmd,
  1547  		putCmd,
  1548  		getCmd,
  1549  		stageCmd,
  1550  		sqlCmd,
  1551  		ipCmd,
  1552  		pgurlCmd,
  1553  		adminurlCmd,
  1554  		logsCmd,
  1555  
  1556  		cachedHostsCmd,
  1557  	)
  1558  	rootCmd.BashCompletionFunction = fmt.Sprintf(`__custom_func()
  1559  	{
  1560  		# only complete the 2nd arg, e.g. adminurl <foo>
  1561  		if ! [ $c -eq 2 ]; then
  1562  			return
  1563  		fi
  1564  
  1565  		# don't complete commands which do not accept a cluster/host arg
  1566  		case ${last_command} in
  1567  			%s)
  1568  				return
  1569  				;;
  1570  		esac
  1571  
  1572  		local hosts_out
  1573  		if hosts_out=$(roachprod cached-hosts --cluster="${cur}" 2>/dev/null); then
  1574  				COMPREPLY=( $( compgen -W "${hosts_out[*]}" -- "$cur" ) )
  1575  		fi
  1576  
  1577  	}`,
  1578  		strings.Join(func(cmds ...*cobra.Command) (s []string) {
  1579  			for _, cmd := range cmds {
  1580  				s = append(s, fmt.Sprintf("%s_%s", rootCmd.Name(), cmd.Name()))
  1581  			}
  1582  			return s
  1583  		}(createCmd, listCmd, syncCmd, gcCmd), " | "),
  1584  	)
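        	// With completion installed, e.g. `roachprod adminurl my<TAB>`
        	// (hypothetical prefix) expands to the matching cached cluster names.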
  1585  
  1586  	rootCmd.PersistentFlags().BoolVarP(
  1587  		&quiet, "quiet", "q", false, "disable fancy progress output")
  1588  	rootCmd.PersistentFlags().IntVarP(
  1589  		&maxConcurrency, "max-concurrency", "", 32,
  1590  		"maximum number of operations to execute on nodes concurrently; set to zero for no limit")
  1591  	for _, cmd := range []*cobra.Command{createCmd, destroyCmd, extendCmd, logsCmd} {
  1592  		cmd.Flags().StringVarP(&username, "username", "u", os.Getenv("ROACHPROD_USER"),
  1593  			"Username to run under; autodetected if blank")
  1594  	}
  1595  
  1596  	for _, cmd := range []*cobra.Command{statusCmd, monitorCmd, startCmd,
  1597  		stopCmd, runCmd, wipeCmd, reformatCmd, installCmd, putCmd, getCmd,
  1598  		sqlCmd, pgurlCmd, adminurlCmd, ipCmd,
  1599  	} {
  1600  		cmd.Flags().BoolVar(
  1601  			&ssh.InsecureIgnoreHostKey, "insecure-ignore-host-key", true, "don't check ssh host keys")
  1602  	}
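        	// Note that host key checking is disabled by default; pass
        	// --insecure-ignore-host-key=false to restore strict SSH host key checks.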
  1603  
  1604  	createCmd.Flags().DurationVarP(&createVMOpts.Lifetime,
  1605  		"lifetime", "l", 12*time.Hour, "Lifetime of the cluster")
  1606  	createCmd.Flags().BoolVar(&createVMOpts.SSDOpts.UseLocalSSD,
  1607  		"local-ssd", true, "Use local SSD")
  1608  	createCmd.Flags().BoolVar(&createVMOpts.SSDOpts.NoExt4Barrier,
  1609  		"local-ssd-no-ext4-barrier", true,
  1610  		`Mount the local SSD with the "-o nobarrier" flag. `+
  1611  			`Ignored if --local-ssd=false is specified.`)
  1612  	createCmd.Flags().IntVarP(&numNodes,
  1613  		"nodes", "n", 4, "Total number of nodes, distributed across all clouds")
  1614  	createCmd.Flags().StringSliceVarP(&createVMOpts.VMProviders,
  1615  		"clouds", "c", []string{gce.ProviderName},
  1616  		fmt.Sprintf("The cloud provider(s) to use when creating new vm instances: %s", vm.AllProviderNames()))
  1617  	createCmd.Flags().BoolVar(&createVMOpts.GeoDistributed,
  1618  		"geo", false, "Create geo-distributed cluster")
  1619  	// Allow each Provider to inject additional configuration flags
  1620  	for _, p := range vm.Providers {
  1621  		p.Flags().ConfigureCreateFlags(createCmd.Flags())
  1622  
  1623  		for _, cmd := range []*cobra.Command{
  1624  			destroyCmd, extendCmd, listCmd, syncCmd, gcCmd,
  1625  		} {
  1626  			p.Flags().ConfigureClusterFlags(cmd.Flags(), vm.AcceptMultipleProjects)
  1627  		}
  1628  		// Unlike the other commands, createCmd only accepts a single GCE
  1629  		// project.
  1630  		p.Flags().ConfigureClusterFlags(createCmd.Flags(), vm.SingleProject)
  1631  	}
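        	// A hypothetical provider-specific invocation might look like:
        	//   roachprod create mycluster -c gce --gce-machine-type=n1-standard-4
        	// (the exact flag names depend on each provider's ConfigureCreateFlags).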
  1632  
  1633  	destroyCmd.Flags().BoolVarP(&destroyAllMine,
  1634  		"all-mine", "m", false, "Destroy all clusters belonging to the current user")
  1635  
  1636  	extendCmd.Flags().DurationVarP(&extendLifetime,
  1637  		"lifetime", "l", 12*time.Hour, "Lifetime of the cluster")
  1638  
  1639  	listCmd.Flags().BoolVarP(&listDetails,
  1640  		"details", "d", false, "Show cluster details")
  1641  	listCmd.Flags().BoolVar(&listJSON,
  1642  		"json", false, "Show cluster specs in a json format")
  1643  	listCmd.Flags().BoolVarP(&listMine,
  1644  		"mine", "m", false, "Show only clusters belonging to the current user")
  1645  
  1646  	adminurlCmd.Flags().BoolVar(
  1647  		&adminurlOpen, `open`, false, `Open the URL in a browser`)
  1648  	adminurlCmd.Flags().StringVar(
  1649  		&adminurlPath, `path`, "/", `Path to append to the URL (e.g. to open the same page on each node)`)
  1650  	adminurlCmd.Flags().BoolVar(
  1651  		&adminurlIPs, `ips`, false, `Use public IPs instead of DNS names in the URL`)
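        	// For example (hypothetical cluster name and path):
        	//   roachprod adminurl --open --path /debug/requests mycluster:1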
  1652  
  1653  	gcCmd.Flags().BoolVarP(
  1654  		&dryrun, "dry-run", "n", dryrun, "dry run (don't perform any actions)")
  1655  	gcCmd.Flags().StringVar(&config.SlackToken, "slack-token", "", "Slack bot token")
  1656  
  1657  	pgurlCmd.Flags().BoolVar(
  1658  		&external, "external", false, "return pgurls for external connections")
  1659  
  1660  	ipCmd.Flags().BoolVar(
  1661  		&external, "external", false, "return external IP addresses")
  1662  
  1663  	runCmd.Flags().BoolVar(
  1664  		&secure, "secure", false, "use a secure cluster")
  1665  
  1666  	startCmd.Flags().IntVarP(&numRacks,
  1667  		"racks", "r", 0, "the number of racks to partition the nodes into")
  1668  
  1669  	stopCmd.Flags().IntVar(&sig, "sig", sig, "signal to pass to kill")
  1670  	stopCmd.Flags().BoolVar(&waitFlag, "wait", waitFlag, "wait for processes to exit")
  1671  
  1672  	wipeCmd.Flags().BoolVar(&wipePreserveCerts, "preserve-certs", false, "do not wipe certificates")
  1673  
  1674  	for _, cmd := range []*cobra.Command{
  1675  		startCmd, statusCmd, stopCmd, runCmd,
  1676  	} {
  1677  		cmd.Flags().StringVar(
  1678  			&tag, "tag", "", "the process tag")
  1679  	}
  1680  
  1681  	for _, cmd := range []*cobra.Command{
  1682  		startCmd, putCmd, getCmd,
  1683  	} {
  1684  		cmd.Flags().BoolVar(new(bool), "scp", false, "DEPRECATED")
  1685  		_ = cmd.Flags().MarkDeprecated("scp", "always true")
  1686  	}
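        	// The value is parsed into a throwaway bool: the flag is still accepted
        	// so that existing scripts keep working, but it no longer has any effect.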
  1687  
  1688  	putCmd.Flags().BoolVar(&useTreeDist, "treedist", useTreeDist, "use treedist copy algorithm")
  1689  
  1690  	stageCmd.Flags().StringVar(&stageOS, "os", "", "operating system override for staged binaries")
  1691  
  1692  	logsCmd.Flags().StringVar(
  1693  		&logsFilter, "filter", "", "regular expression used to filter log messages")
  1694  	logsCmd.Flags().Var(
  1695  		flagutil.Time(&logsFrom), "from", "time from which to stream logs")
  1696  	logsCmd.Flags().Var(
  1697  		flagutil.Time(&logsTo), "to", "time to which to stream logs")
  1698  	logsCmd.Flags().DurationVar(
  1699  		&logsInterval, "interval", 200*time.Millisecond, "interval to poll logs from host")
  1700  	logsCmd.Flags().StringVar(
  1701  		&logsDir, "logs-dir", "logs", "path to the logs dir; if remote, relative to the user's home dir (ignored for local clusters)")
  1702  	logsCmd.Flags().StringVar(
  1703  		&logsProgramFilter, "logs-program", "^cockroach$", "regular expression matching the program name to search for in log files")
  1704  
  1705  	monitorCmd.Flags().BoolVar(
  1706  		&monitorIgnoreEmptyNodes,
  1707  		"ignore-empty-nodes",
  1708  		false,
  1709  		"Automatically detect which of the given nodes to monitor, "+
  1710  			"based on the presence of a nontrivial data directory.")
  1711  
  1712  	monitorCmd.Flags().BoolVar(
  1713  		&monitorOneShot,
  1714  		"oneshot",
  1715  		false,
  1716  		"Report the status of all targeted nodes once, then exit. The exit "+
  1717  			"status is nonzero if (and only if) any node was found not running.")
  1718  
  1719  	cachedHostsCmd.Flags().StringVar(&cachedHostsCluster, "cluster", "", "print hosts matching cluster")
  1720  
  1721  	for _, cmd := range []*cobra.Command{
  1722  		getCmd, putCmd, runCmd, startCmd, statusCmd, stopCmd,
  1723  		wipeCmd, pgurlCmd, adminurlCmd, sqlCmd, installCmd,
  1724  	} {
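        		// The fallthrough chain below layers the flags: startCmd receives
        		// everything in this switch, sqlCmd receives --binary and --secure,
        		// and pgurlCmd/adminurlCmd receive only --secure.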
  1725  		switch cmd {
  1726  		case startCmd:
  1727  			cmd.Flags().BoolVar(
  1728  				&install.StartOpts.Sequential, "sequential", true,
  1729  				"start nodes sequentially so node IDs match hostnames")
  1730  			cmd.Flags().StringArrayVarP(
  1731  				&nodeArgs, "args", "a", nil, "node arguments")
  1732  			cmd.Flags().StringVarP(
  1733  				&nodeEnv, "env", "e", nodeEnv, "node environment variables")
  1734  			cmd.Flags().StringVarP(
  1735  				&clusterType, "type", "t", clusterType, `cluster type ("cockroach" or "cassandra")`)
  1736  			cmd.Flags().BoolVar(
  1737  				&install.StartOpts.Encrypt, "encrypt", encrypt, "start nodes with encryption at rest turned on")
  1738  			fallthrough
  1739  		case sqlCmd:
  1740  			cmd.Flags().StringVarP(
  1741  				&config.Binary, "binary", "b", config.Binary,
  1742  				"the remote cockroach binary to use")
  1743  			fallthrough
  1744  		case pgurlCmd, adminurlCmd:
  1745  			cmd.Flags().BoolVar(
  1746  				&secure, "secure", false, "use a secure cluster")
  1747  		}
  1748  
  1749  		if cmd.Long == "" {
  1750  			cmd.Long = cmd.Short
  1751  		}
  1752  		cmd.Long += fmt.Sprintf(`
  1753  Node specification
  1754  
  1755    By default the operation is performed on all nodes in <cluster>. A subset
  1756    of nodes can be specified by appending :<nodes> to the cluster name. The
  1757    syntax of <nodes> is a comma-separated list of node IDs or ID ranges. For
  1758    example:
  1759  
  1760      roachprod %[1]s marc-test:1-3,8-9
  1761  
  1762    will perform %[1]s on:
  1763  
  1764      marc-test-1
  1765      marc-test-2
  1766      marc-test-3
  1767      marc-test-8
  1768      marc-test-9
  1769  `, cmd.Name())
  1770  	}
  1771  
  1772  	var err error
  1773  	config.OSUser, err = user.Current()
  1774  	if err != nil {
  1775  		fmt.Fprintf(os.Stderr, "unable to look up current user: %s\n", err)
  1776  		os.Exit(1)
  1777  	}
  1778  
  1779  	if err := initDirs(); err != nil {
  1780  		fmt.Fprintf(os.Stderr, "%s\n", err)
  1781  		os.Exit(1)
  1782  	}
  1783  
  1784  	if err := loadClusters(); err != nil {
  1785  		// We don't want to exit as we may be looking at the help message.
  1786  		fmt.Printf("problem loading clusters: %s\n", err)
  1787  	}
  1788  
  1789  	if err := rootCmd.Execute(); err != nil {
  1790  		// Cobra has already printed the error message.
  1791  		os.Exit(1)
  1792  	}
  1793  }