github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachtest/main.go

github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachtest/main.go (about)

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package main
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"math/rand"
    17  	"os"
    18  	"os/signal"
    19  	"os/user"
    20  	"path/filepath"
    21  	"time"
    22  
    23  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    24  	"github.com/spf13/cobra"
    25  )
    26  
// runnerLogsDir is the dir under the artifacts root where the test runner log
// and other runner-related logs (i.e. cluster creation logs) will be written.
// runTests joins it onto the --artifacts directory to build the path of the
// test_runner-<unix timestamp>.log file.
const runnerLogsDir = "_runner-logs"
    30  
    31  func main() {
    32  	rand.Seed(timeutil.Now().UnixNano())
    33  	username := os.Getenv("ROACHPROD_USER")
    34  	parallelism := 10
    35  	var cpuQuota int
    36  	// Path to a local dir where the test logs and artifacts collected from
    37  	// cluster will be placed.
    38  	var artifacts string
    39  	var httpPort int
    40  	var debugEnabled bool
    41  	var clusterID string
    42  	var count = 1
    43  
    44  	cobra.EnableCommandSorting = false
    45  
    46  	var rootCmd = &cobra.Command{
    47  		Use:   "roachtest [command] (flags)",
    48  		Short: "roachtest tool for testing cockroach clusters",
    49  		Long: `roachtest is a tool for testing cockroach clusters.
    50  `,
    51  
    52  		PersistentPreRunE: func(cmd *cobra.Command, _ []string) error {
    53  			// Don't bother checking flags for the default help command.
    54  			if cmd.Name() == "help" {
    55  				return nil
    56  			}
    57  
    58  			if clusterName != "" && local {
    59  				return fmt.Errorf(
    60  					"cannot specify both an existing cluster (%s) and --local. However, if a local cluster "+
    61  						"already exists, --clusters=local will use it",
    62  					clusterName)
    63  			}
    64  			switch cmd.Name() {
    65  			case "run", "bench", "store-gen":
    66  				initBinaries()
    67  			}
    68  			return nil
    69  		},
    70  	}
    71  
    72  	rootCmd.PersistentFlags().StringVarP(
    73  		&clusterName, "cluster", "c", "",
    74  		"Comma-separated list of names existing cluster to use for running tests. "+
    75  			"If fewer than --parallelism names are specified, then the parallelism "+
    76  			"is capped to the number of clusters specified.")
    77  	rootCmd.PersistentFlags().BoolVarP(
    78  		&local, "local", "l", local, "run tests locally")
    79  	rootCmd.PersistentFlags().StringVarP(
    80  		&username, "user", "u", username,
    81  		"Username to use as a cluster name prefix. "+
    82  			"If blank, the current OS user is detected and specified.")
    83  	rootCmd.PersistentFlags().StringVar(
    84  		&cockroach, "cockroach", "", "path to cockroach binary to use")
    85  	rootCmd.PersistentFlags().StringVar(
    86  		&workload, "workload", "", "path to workload binary to use")
    87  	f := rootCmd.PersistentFlags().VarPF(
    88  		&encrypt, "encrypt", "", "start cluster with encryption at rest turned on")
    89  	f.NoOptDefVal = "true"
    90  
    91  	var listBench bool
    92  
    93  	var listCmd = &cobra.Command{
    94  		Use:   "list [tests]",
    95  		Short: "list tests matching the patterns",
    96  		Long: `List tests that match the given name patterns.
    97  
    98  If no pattern is passed, all tests are matched.
    99  Use --bench to list benchmarks instead of tests.
   100  
   101  Each test has a set of tags. The tags are used to skip tests which don't match
   102  the tag filter. The tag filter is specified by specifying a pattern with the
   103  "tag:" prefix. The default tag filter is "tag:default" which matches any test
   104  that has the "default" tag. Note that tests are selected based on their name,
   105  and skipped based on their tag.
   106  
   107  Examples:
   108  
   109     roachtest list acceptance copy/bank/.*false
   110     roachtest list tag:acceptance
   111     roachtest list tag:weekly
   112  `,
   113  		RunE: func(_ *cobra.Command, args []string) error {
   114  			r, err := makeTestRegistry()
   115  			if err != nil {
   116  				return err
   117  			}
   118  			if !listBench {
   119  				registerTests(&r)
   120  			} else {
   121  				registerBenchmarks(&r)
   122  			}
   123  
   124  			matchedTests := r.List(context.Background(), args)
   125  			for _, test := range matchedTests {
   126  				var skip string
   127  				if test.Skip != "" {
   128  					skip = " (skipped: " + test.Skip + ")"
   129  				}
   130  				fmt.Printf("%s [%s]%s\n", test.Name, test.Owner, skip)
   131  			}
   132  			return nil
   133  		},
   134  	}
   135  	listCmd.Flags().BoolVar(
   136  		&listBench, "bench", false, "list benchmarks instead of tests")
   137  
   138  	var runCmd = &cobra.Command{
   139  		// Don't display usage when tests fail.
   140  		SilenceUsage: true,
   141  		Use:          "run [tests]",
   142  		Short:        "run automated tests on cockroach cluster",
   143  		Long: `Run automated tests on existing or ephemeral cockroach clusters.
   144  
   145  roachtest run takes a list of regex patterns and runs all the matching tests.
   146  If no pattern is given, all tests are run. See "help list" for more details on
   147  the test tags.
   148  `,
   149  		RunE: func(_ *cobra.Command, args []string) error {
   150  			return runTests(registerTests, cliCfg{
   151  				args:         args,
   152  				count:        count,
   153  				cpuQuota:     cpuQuota,
   154  				debugEnabled: debugEnabled,
   155  				httpPort:     httpPort,
   156  				parallelism:  parallelism,
   157  				artifactsDir: artifacts,
   158  				user:         username,
   159  				clusterID:    clusterID,
   160  			})
   161  		},
   162  	}
   163  
   164  	runCmd.Flags().StringVar(
   165  		&buildTag, "build-tag", "", "build tag (auto-detect if empty)")
   166  	runCmd.Flags().StringVar(
   167  		&slackToken, "slack-token", "", "Slack bot token")
   168  	runCmd.Flags().BoolVar(
   169  		&teamCity, "teamcity", false, "include teamcity-specific markers in output")
   170  
   171  	var benchCmd = &cobra.Command{
   172  		// Don't display usage when tests fail.
   173  		SilenceUsage: true,
   174  		Use:          "bench [benchmarks]",
   175  		Short:        "run automated benchmarks on cockroach cluster",
   176  		Long:         `Run automated benchmarks on existing or ephemeral cockroach clusters.`,
   177  		RunE: func(_ *cobra.Command, args []string) error {
   178  			return runTests(registerBenchmarks, cliCfg{
   179  				args:         args,
   180  				count:        count,
   181  				cpuQuota:     cpuQuota,
   182  				debugEnabled: debugEnabled,
   183  				httpPort:     httpPort,
   184  				parallelism:  parallelism,
   185  				artifactsDir: artifacts,
   186  				user:         username,
   187  				clusterID:    clusterID,
   188  			})
   189  		},
   190  	}
   191  
   192  	// Register flags shared between `run` and `bench`.
   193  	for _, cmd := range []*cobra.Command{runCmd, benchCmd} {
   194  		cmd.Flags().StringVar(
   195  			&artifacts, "artifacts", "artifacts", "path to artifacts directory")
   196  		cmd.Flags().StringVar(
   197  			&cloud, "cloud", cloud, "cloud provider to use (aws, azure, or gce)")
   198  		cmd.Flags().StringVar(
   199  			&clusterID, "cluster-id", "", "an identifier to use in the test cluster's name")
   200  		cmd.Flags().IntVar(
   201  			&count, "count", 1, "the number of times to run each test")
   202  		cmd.Flags().BoolVarP(
   203  			&debugEnabled, "debug", "d", debugEnabled, "don't wipe and destroy cluster if test fails")
   204  		cmd.Flags().IntVarP(
   205  			&parallelism, "parallelism", "p", parallelism, "number of tests to run in parallel")
   206  		cmd.Flags().StringVar(
   207  			&roachprod, "roachprod", "", "path to roachprod binary to use")
   208  		cmd.Flags().BoolVar(
   209  			&clusterWipe, "wipe", true,
   210  			"wipe existing cluster before starting test (for use with --cluster)")
   211  		cmd.Flags().StringVar(
   212  			&zonesF, "zones", "",
   213  			"Zones for the cluster. (non-geo tests use the first zone, geo tests use all zones) "+
   214  				"(uses roachprod defaults if empty)")
   215  		cmd.Flags().StringVar(
   216  			&instanceType, "instance-type", instanceType,
   217  			"the instance type to use (see https://aws.amazon.com/ec2/instance-types/, https://cloud.google.com/compute/docs/machine-types or https://docs.microsoft.com/en-us/azure/virtual-machines/windows/sizes)")
   218  		cmd.Flags().IntVar(
   219  			&cpuQuota, "cpu-quota", 300,
   220  			"The number of cloud CPUs roachtest is allowed to use at any one time.")
   221  		cmd.Flags().IntVar(
   222  			&httpPort, "port", 8080, "the port on which to serve the HTTP interface")
   223  	}
   224  
   225  	rootCmd.AddCommand(listCmd)
   226  	rootCmd.AddCommand(runCmd)
   227  	rootCmd.AddCommand(benchCmd)
   228  
   229  	if err := rootCmd.Execute(); err != nil {
   230  		// Cobra has already printed the error message.
   231  		os.Exit(1)
   232  	}
   233  }
   234  
   235  type cliCfg struct {
   236  	args         []string
   237  	count        int
   238  	cpuQuota     int
   239  	debugEnabled bool
   240  	httpPort     int
   241  	parallelism  int
   242  	artifactsDir string
   243  	user         string
   244  	clusterID    string
   245  }
   246  
   247  func runTests(register func(*testRegistry), cfg cliCfg) error {
   248  	if cfg.count <= 0 {
   249  		return fmt.Errorf("--count (%d) must by greater than 0", cfg.count)
   250  	}
   251  	r, err := makeTestRegistry()
   252  	if err != nil {
   253  		return err
   254  	}
   255  	register(&r)
   256  	cr := newClusterRegistry()
   257  	runner := newTestRunner(cr, r.buildVersion)
   258  
   259  	filter := newFilter(cfg.args)
   260  	clusterType := roachprodCluster
   261  	if local {
   262  		clusterType = localCluster
   263  		if cfg.parallelism != 1 {
   264  			fmt.Printf("--local specified. Overriding --parallelism to 1.\n")
   265  			cfg.parallelism = 1
   266  		}
   267  	}
   268  	opt := clustersOpt{
   269  		typ:                       clusterType,
   270  		clusterName:               clusterName,
   271  		user:                      getUser(cfg.user),
   272  		cpuQuota:                  cfg.cpuQuota,
   273  		keepClustersOnTestFailure: cfg.debugEnabled,
   274  		clusterID:                 cfg.clusterID,
   275  	}
   276  	if err := runner.runHTTPServer(cfg.httpPort, os.Stdout); err != nil {
   277  		return err
   278  	}
   279  
   280  	tests := testsToRun(context.Background(), r, filter)
   281  	n := len(tests)
   282  	if n*cfg.count < cfg.parallelism {
   283  		// Don't spin up more workers than necessary. This has particular
   284  		// implications for the common case of running a single test once: if
   285  		// parallelism is set to 1, we'll use teeToStdout below to get logs to
   286  		// stdout/stderr.
   287  		cfg.parallelism = n * cfg.count
   288  	}
   289  	runnerDir := filepath.Join(cfg.artifactsDir, runnerLogsDir)
   290  	runnerLogPath := filepath.Join(
   291  		runnerDir, fmt.Sprintf("test_runner-%d.log", timeutil.Now().Unix()))
   292  	l, tee := testRunnerLogger(context.Background(), cfg.parallelism, runnerLogPath)
   293  	lopt := loggingOpt{
   294  		l:             l,
   295  		tee:           tee,
   296  		stdout:        os.Stdout,
   297  		stderr:        os.Stderr,
   298  		artifactsDir:  cfg.artifactsDir,
   299  		runnerLogPath: runnerLogPath,
   300  	}
   301  
   302  	// We're going to run all the workers (and thus all the tests) in a context
   303  	// that gets canceled when the Interrupt signal is received.
   304  	ctx, cancel := context.WithCancel(context.Background())
   305  	defer cancel()
   306  	CtrlC(ctx, l, cancel, cr)
   307  	err = runner.Run(ctx, tests, cfg.count, cfg.parallelism, opt, cfg.artifactsDir, lopt)
   308  
   309  	// Make sure we attempt to clean up. We run with a non-canceled ctx; the
   310  	// ctx above might be canceled in case a signal was received. If that's
   311  	// the case, we're running under a 5s timeout until the CtrlC() goroutine
   312  	// kills the process.
   313  	l.PrintfCtx(ctx, "runTests destroying all clusters")
   314  	cr.destroyAllClusters(context.Background(), l)
   315  
   316  	if teamCity {
   317  		// Collect the runner logs.
   318  		fmt.Printf("##teamcity[publishArtifacts '%s']\n", runnerDir)
   319  	}
   320  	return err
   321  }
   322  
   323  // getUser takes the value passed on the command line and comes up with the
   324  // username to use.
   325  func getUser(userFlag string) string {
   326  	if userFlag != "" {
   327  		return userFlag
   328  	}
   329  	usr, err := user.Current()
   330  	if err != nil {
   331  		panic(fmt.Sprintf("user.Current: %s", err))
   332  	}
   333  	return usr.Username
   334  }
   335  
   336  // CtrlC spawns a goroutine that sits around waiting for SIGINT. Once the first
   337  // signal is received, it calls cancel(), waits 5 seconds, and then calls
   338  // cr.destroyAllClusters(). The expectation is that the main goroutine will
   339  // respond to the cancelation and return, and so the process will be dead by the
   340  // time the 5s elapse.
   341  // If a 2nd signal is received, it calls os.Exit(2).
   342  func CtrlC(ctx context.Context, l *logger, cancel func(), cr *clusterRegistry) {
   343  	// Shut down test clusters when interrupted (for example CTRL-C).
   344  	sig := make(chan os.Signal, 1)
   345  	signal.Notify(sig, os.Interrupt)
   346  	go func() {
   347  		<-sig
   348  		shout(ctx, l, os.Stderr,
   349  			"Signaled received. Canceling workers and waiting up to 5s for them.")
   350  		// Signal runner.Run() to stop.
   351  		cancel()
   352  		<-time.After(5 * time.Second)
   353  		shout(ctx, l, os.Stderr, "5s elapsed. Will brutally destroy all clusters.")
   354  		// Make sure there are no leftover clusters.
   355  		destroyCh := make(chan struct{})
   356  		go func() {
   357  			// Destroy all clusters. Don't wait more than 5 min for that though.
   358  			destroyCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
   359  			l.PrintfCtx(ctx, "CtrlC handler destroying all clusters")
   360  			cr.destroyAllClusters(destroyCtx, l)
   361  			cancel()
   362  			close(destroyCh)
   363  		}()
   364  		// If we get a second CTRL-C, exit immediately.
   365  		select {
   366  		case <-sig:
   367  			shout(ctx, l, os.Stderr, "Second SIGINT received. Quitting.")
   368  			os.Exit(2)
   369  		case <-destroyCh:
   370  			shout(ctx, l, os.Stderr, "Done destroying all clusters.")
   371  		}
   372  	}()
   373  }
   374  
   375  // testRunnerLogger returns a logger to be used by the test runner and a tee
   376  // option for the test logs.
   377  //
   378  // runnerLogPath is the path to the file that will contain the runner's log.
   379  func testRunnerLogger(
   380  	ctx context.Context, parallelism int, runnerLogPath string,
   381  ) (*logger, teeOptType) {
   382  	teeOpt := noTee
   383  	if parallelism == 1 {
   384  		teeOpt = teeToStdout
   385  	}
   386  
   387  	var l *logger
   388  	if teeOpt == teeToStdout {
   389  		verboseCfg := loggerConfig{stdout: os.Stdout, stderr: os.Stderr}
   390  		var err error
   391  		l, err = verboseCfg.newLogger(runnerLogPath)
   392  		if err != nil {
   393  			panic(err)
   394  		}
   395  	} else {
   396  		verboseCfg := loggerConfig{}
   397  		var err error
   398  		l, err = verboseCfg.newLogger(runnerLogPath)
   399  		if err != nil {
   400  			panic(err)
   401  		}
   402  	}
   403  	shout(ctx, l, os.Stdout, "test runner logs in: %s", runnerLogPath)
   404  	return l, teeOpt
   405  }
   406  
   407  func testsToRun(ctx context.Context, r testRegistry, filter *testFilter) []testSpec {
   408  	tests := r.GetTests(ctx, filter)
   409  
   410  	var notSkipped []testSpec
   411  	for _, s := range tests {
   412  		if s.Skip == "" {
   413  			notSkipped = append(notSkipped, s)
   414  		} else {
   415  			if teamCity {
   416  				fmt.Fprintf(os.Stdout, "##teamcity[testIgnored name='%s' message='%s']\n",
   417  					s.Name, teamCityEscape(s.Skip))
   418  			}
   419  			fmt.Fprintf(os.Stdout, "--- SKIP: %s (%s)\n\t%s\n", s.Name, "0.00s", s.Skip)
   420  		}
   421  	}
   422  	return notSkipped
   423  }