github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachtest/main.go (about) 1 // Copyright 2018 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package main 12 13 import ( 14 "context" 15 "fmt" 16 "math/rand" 17 "os" 18 "os/signal" 19 "os/user" 20 "path/filepath" 21 "time" 22 23 "github.com/cockroachdb/cockroach/pkg/util/timeutil" 24 "github.com/spf13/cobra" 25 ) 26 27 // runnerLogsDir is the dir under the artifacts root where the test runner log 28 // and other runner-related logs (i.e. cluster creation logs) will be written. 29 const runnerLogsDir = "_runner-logs" 30 31 func main() { 32 rand.Seed(timeutil.Now().UnixNano()) 33 username := os.Getenv("ROACHPROD_USER") 34 parallelism := 10 35 var cpuQuota int 36 // Path to a local dir where the test logs and artifacts collected from 37 // cluster will be placed. 38 var artifacts string 39 var httpPort int 40 var debugEnabled bool 41 var clusterID string 42 var count = 1 43 44 cobra.EnableCommandSorting = false 45 46 var rootCmd = &cobra.Command{ 47 Use: "roachtest [command] (flags)", 48 Short: "roachtest tool for testing cockroach clusters", 49 Long: `roachtest is a tool for testing cockroach clusters. 50 `, 51 52 PersistentPreRunE: func(cmd *cobra.Command, _ []string) error { 53 // Don't bother checking flags for the default help command. 54 if cmd.Name() == "help" { 55 return nil 56 } 57 58 if clusterName != "" && local { 59 return fmt.Errorf( 60 "cannot specify both an existing cluster (%s) and --local. However, if a local cluster "+ 61 "already exists, --clusters=local will use it", 62 clusterName) 63 } 64 switch cmd.Name() { 65 case "run", "bench", "store-gen": 66 initBinaries() 67 } 68 return nil 69 }, 70 } 71 72 rootCmd.PersistentFlags().StringVarP( 73 &clusterName, "cluster", "c", "", 74 "Comma-separated list of names existing cluster to use for running tests. "+ 75 "If fewer than --parallelism names are specified, then the parallelism "+ 76 "is capped to the number of clusters specified.") 77 rootCmd.PersistentFlags().BoolVarP( 78 &local, "local", "l", local, "run tests locally") 79 rootCmd.PersistentFlags().StringVarP( 80 &username, "user", "u", username, 81 "Username to use as a cluster name prefix. "+ 82 "If blank, the current OS user is detected and specified.") 83 rootCmd.PersistentFlags().StringVar( 84 &cockroach, "cockroach", "", "path to cockroach binary to use") 85 rootCmd.PersistentFlags().StringVar( 86 &workload, "workload", "", "path to workload binary to use") 87 f := rootCmd.PersistentFlags().VarPF( 88 &encrypt, "encrypt", "", "start cluster with encryption at rest turned on") 89 f.NoOptDefVal = "true" 90 91 var listBench bool 92 93 var listCmd = &cobra.Command{ 94 Use: "list [tests]", 95 Short: "list tests matching the patterns", 96 Long: `List tests that match the given name patterns. 97 98 If no pattern is passed, all tests are matched. 99 Use --bench to list benchmarks instead of tests. 100 101 Each test has a set of tags. The tags are used to skip tests which don't match 102 the tag filter. The tag filter is specified by specifying a pattern with the 103 "tag:" prefix. The default tag filter is "tag:default" which matches any test 104 that has the "default" tag. Note that tests are selected based on their name, 105 and skipped based on their tag. 106 107 Examples: 108 109 roachtest list acceptance copy/bank/.*false 110 roachtest list tag:acceptance 111 roachtest list tag:weekly 112 `, 113 RunE: func(_ *cobra.Command, args []string) error { 114 r, err := makeTestRegistry() 115 if err != nil { 116 return err 117 } 118 if !listBench { 119 registerTests(&r) 120 } else { 121 registerBenchmarks(&r) 122 } 123 124 matchedTests := r.List(context.Background(), args) 125 for _, test := range matchedTests { 126 var skip string 127 if test.Skip != "" { 128 skip = " (skipped: " + test.Skip + ")" 129 } 130 fmt.Printf("%s [%s]%s\n", test.Name, test.Owner, skip) 131 } 132 return nil 133 }, 134 } 135 listCmd.Flags().BoolVar( 136 &listBench, "bench", false, "list benchmarks instead of tests") 137 138 var runCmd = &cobra.Command{ 139 // Don't display usage when tests fail. 140 SilenceUsage: true, 141 Use: "run [tests]", 142 Short: "run automated tests on cockroach cluster", 143 Long: `Run automated tests on existing or ephemeral cockroach clusters. 144 145 roachtest run takes a list of regex patterns and runs all the matching tests. 146 If no pattern is given, all tests are run. See "help list" for more details on 147 the test tags. 148 `, 149 RunE: func(_ *cobra.Command, args []string) error { 150 return runTests(registerTests, cliCfg{ 151 args: args, 152 count: count, 153 cpuQuota: cpuQuota, 154 debugEnabled: debugEnabled, 155 httpPort: httpPort, 156 parallelism: parallelism, 157 artifactsDir: artifacts, 158 user: username, 159 clusterID: clusterID, 160 }) 161 }, 162 } 163 164 runCmd.Flags().StringVar( 165 &buildTag, "build-tag", "", "build tag (auto-detect if empty)") 166 runCmd.Flags().StringVar( 167 &slackToken, "slack-token", "", "Slack bot token") 168 runCmd.Flags().BoolVar( 169 &teamCity, "teamcity", false, "include teamcity-specific markers in output") 170 171 var benchCmd = &cobra.Command{ 172 // Don't display usage when tests fail. 173 SilenceUsage: true, 174 Use: "bench [benchmarks]", 175 Short: "run automated benchmarks on cockroach cluster", 176 Long: `Run automated benchmarks on existing or ephemeral cockroach clusters.`, 177 RunE: func(_ *cobra.Command, args []string) error { 178 return runTests(registerBenchmarks, cliCfg{ 179 args: args, 180 count: count, 181 cpuQuota: cpuQuota, 182 debugEnabled: debugEnabled, 183 httpPort: httpPort, 184 parallelism: parallelism, 185 artifactsDir: artifacts, 186 user: username, 187 clusterID: clusterID, 188 }) 189 }, 190 } 191 192 // Register flags shared between `run` and `bench`. 193 for _, cmd := range []*cobra.Command{runCmd, benchCmd} { 194 cmd.Flags().StringVar( 195 &artifacts, "artifacts", "artifacts", "path to artifacts directory") 196 cmd.Flags().StringVar( 197 &cloud, "cloud", cloud, "cloud provider to use (aws, azure, or gce)") 198 cmd.Flags().StringVar( 199 &clusterID, "cluster-id", "", "an identifier to use in the test cluster's name") 200 cmd.Flags().IntVar( 201 &count, "count", 1, "the number of times to run each test") 202 cmd.Flags().BoolVarP( 203 &debugEnabled, "debug", "d", debugEnabled, "don't wipe and destroy cluster if test fails") 204 cmd.Flags().IntVarP( 205 ¶llelism, "parallelism", "p", parallelism, "number of tests to run in parallel") 206 cmd.Flags().StringVar( 207 &roachprod, "roachprod", "", "path to roachprod binary to use") 208 cmd.Flags().BoolVar( 209 &clusterWipe, "wipe", true, 210 "wipe existing cluster before starting test (for use with --cluster)") 211 cmd.Flags().StringVar( 212 &zonesF, "zones", "", 213 "Zones for the cluster. (non-geo tests use the first zone, geo tests use all zones) "+ 214 "(uses roachprod defaults if empty)") 215 cmd.Flags().StringVar( 216 &instanceType, "instance-type", instanceType, 217 "the instance type to use (see https://aws.amazon.com/ec2/instance-types/, https://cloud.google.com/compute/docs/machine-types or https://docs.microsoft.com/en-us/azure/virtual-machines/windows/sizes)") 218 cmd.Flags().IntVar( 219 &cpuQuota, "cpu-quota", 300, 220 "The number of cloud CPUs roachtest is allowed to use at any one time.") 221 cmd.Flags().IntVar( 222 &httpPort, "port", 8080, "the port on which to serve the HTTP interface") 223 } 224 225 rootCmd.AddCommand(listCmd) 226 rootCmd.AddCommand(runCmd) 227 rootCmd.AddCommand(benchCmd) 228 229 if err := rootCmd.Execute(); err != nil { 230 // Cobra has already printed the error message. 231 os.Exit(1) 232 } 233 } 234 235 type cliCfg struct { 236 args []string 237 count int 238 cpuQuota int 239 debugEnabled bool 240 httpPort int 241 parallelism int 242 artifactsDir string 243 user string 244 clusterID string 245 } 246 247 func runTests(register func(*testRegistry), cfg cliCfg) error { 248 if cfg.count <= 0 { 249 return fmt.Errorf("--count (%d) must by greater than 0", cfg.count) 250 } 251 r, err := makeTestRegistry() 252 if err != nil { 253 return err 254 } 255 register(&r) 256 cr := newClusterRegistry() 257 runner := newTestRunner(cr, r.buildVersion) 258 259 filter := newFilter(cfg.args) 260 clusterType := roachprodCluster 261 if local { 262 clusterType = localCluster 263 if cfg.parallelism != 1 { 264 fmt.Printf("--local specified. Overriding --parallelism to 1.\n") 265 cfg.parallelism = 1 266 } 267 } 268 opt := clustersOpt{ 269 typ: clusterType, 270 clusterName: clusterName, 271 user: getUser(cfg.user), 272 cpuQuota: cfg.cpuQuota, 273 keepClustersOnTestFailure: cfg.debugEnabled, 274 clusterID: cfg.clusterID, 275 } 276 if err := runner.runHTTPServer(cfg.httpPort, os.Stdout); err != nil { 277 return err 278 } 279 280 tests := testsToRun(context.Background(), r, filter) 281 n := len(tests) 282 if n*cfg.count < cfg.parallelism { 283 // Don't spin up more workers than necessary. This has particular 284 // implications for the common case of running a single test once: if 285 // parallelism is set to 1, we'll use teeToStdout below to get logs to 286 // stdout/stderr. 287 cfg.parallelism = n * cfg.count 288 } 289 runnerDir := filepath.Join(cfg.artifactsDir, runnerLogsDir) 290 runnerLogPath := filepath.Join( 291 runnerDir, fmt.Sprintf("test_runner-%d.log", timeutil.Now().Unix())) 292 l, tee := testRunnerLogger(context.Background(), cfg.parallelism, runnerLogPath) 293 lopt := loggingOpt{ 294 l: l, 295 tee: tee, 296 stdout: os.Stdout, 297 stderr: os.Stderr, 298 artifactsDir: cfg.artifactsDir, 299 runnerLogPath: runnerLogPath, 300 } 301 302 // We're going to run all the workers (and thus all the tests) in a context 303 // that gets canceled when the Interrupt signal is received. 304 ctx, cancel := context.WithCancel(context.Background()) 305 defer cancel() 306 CtrlC(ctx, l, cancel, cr) 307 err = runner.Run(ctx, tests, cfg.count, cfg.parallelism, opt, cfg.artifactsDir, lopt) 308 309 // Make sure we attempt to clean up. We run with a non-canceled ctx; the 310 // ctx above might be canceled in case a signal was received. If that's 311 // the case, we're running under a 5s timeout until the CtrlC() goroutine 312 // kills the process. 313 l.PrintfCtx(ctx, "runTests destroying all clusters") 314 cr.destroyAllClusters(context.Background(), l) 315 316 if teamCity { 317 // Collect the runner logs. 318 fmt.Printf("##teamcity[publishArtifacts '%s']\n", runnerDir) 319 } 320 return err 321 } 322 323 // getUser takes the value passed on the command line and comes up with the 324 // username to use. 325 func getUser(userFlag string) string { 326 if userFlag != "" { 327 return userFlag 328 } 329 usr, err := user.Current() 330 if err != nil { 331 panic(fmt.Sprintf("user.Current: %s", err)) 332 } 333 return usr.Username 334 } 335 336 // CtrlC spawns a goroutine that sits around waiting for SIGINT. Once the first 337 // signal is received, it calls cancel(), waits 5 seconds, and then calls 338 // cr.destroyAllClusters(). The expectation is that the main goroutine will 339 // respond to the cancelation and return, and so the process will be dead by the 340 // time the 5s elapse. 341 // If a 2nd signal is received, it calls os.Exit(2). 342 func CtrlC(ctx context.Context, l *logger, cancel func(), cr *clusterRegistry) { 343 // Shut down test clusters when interrupted (for example CTRL-C). 344 sig := make(chan os.Signal, 1) 345 signal.Notify(sig, os.Interrupt) 346 go func() { 347 <-sig 348 shout(ctx, l, os.Stderr, 349 "Signaled received. Canceling workers and waiting up to 5s for them.") 350 // Signal runner.Run() to stop. 351 cancel() 352 <-time.After(5 * time.Second) 353 shout(ctx, l, os.Stderr, "5s elapsed. Will brutally destroy all clusters.") 354 // Make sure there are no leftover clusters. 355 destroyCh := make(chan struct{}) 356 go func() { 357 // Destroy all clusters. Don't wait more than 5 min for that though. 358 destroyCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) 359 l.PrintfCtx(ctx, "CtrlC handler destroying all clusters") 360 cr.destroyAllClusters(destroyCtx, l) 361 cancel() 362 close(destroyCh) 363 }() 364 // If we get a second CTRL-C, exit immediately. 365 select { 366 case <-sig: 367 shout(ctx, l, os.Stderr, "Second SIGINT received. Quitting.") 368 os.Exit(2) 369 case <-destroyCh: 370 shout(ctx, l, os.Stderr, "Done destroying all clusters.") 371 } 372 }() 373 } 374 375 // testRunnerLogger returns a logger to be used by the test runner and a tee 376 // option for the test logs. 377 // 378 // runnerLogPath is the path to the file that will contain the runner's log. 379 func testRunnerLogger( 380 ctx context.Context, parallelism int, runnerLogPath string, 381 ) (*logger, teeOptType) { 382 teeOpt := noTee 383 if parallelism == 1 { 384 teeOpt = teeToStdout 385 } 386 387 var l *logger 388 if teeOpt == teeToStdout { 389 verboseCfg := loggerConfig{stdout: os.Stdout, stderr: os.Stderr} 390 var err error 391 l, err = verboseCfg.newLogger(runnerLogPath) 392 if err != nil { 393 panic(err) 394 } 395 } else { 396 verboseCfg := loggerConfig{} 397 var err error 398 l, err = verboseCfg.newLogger(runnerLogPath) 399 if err != nil { 400 panic(err) 401 } 402 } 403 shout(ctx, l, os.Stdout, "test runner logs in: %s", runnerLogPath) 404 return l, teeOpt 405 } 406 407 func testsToRun(ctx context.Context, r testRegistry, filter *testFilter) []testSpec { 408 tests := r.GetTests(ctx, filter) 409 410 var notSkipped []testSpec 411 for _, s := range tests { 412 if s.Skip == "" { 413 notSkipped = append(notSkipped, s) 414 } else { 415 if teamCity { 416 fmt.Fprintf(os.Stdout, "##teamcity[testIgnored name='%s' message='%s']\n", 417 s.Name, teamCityEscape(s.Skip)) 418 } 419 fmt.Fprintf(os.Stdout, "--- SKIP: %s (%s)\n\t%s\n", s.Name, "0.00s", s.Skip) 420 } 421 } 422 return notSkipped 423 }