github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachtest/test_runner.go

// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package main

import (
	"context"
	"fmt"
	"io"
	"math/rand"
	"net"
	"net/http"
	"os"
	"os/exec"
	"path/filepath"
	"runtime"
	"sort"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/cockroachdb/cockroach/pkg/cmd/internal/issues"
	"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/quotapool"
	"github.com/cockroachdb/cockroach/pkg/util/stop"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/cockroach/pkg/util/version"
	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/logtags"
	"github.com/petermattis/goid"
)

// testRunner runs tests.
type testRunner struct {
	// buildVersion is the version of the Cockroach binary that tests will run against.
	buildVersion version.Version

	config struct {
		// skipClusterValidationOnAttach skips validation on existing clusters that
		// the registry uses for running tests.
		skipClusterValidationOnAttach bool
		// skipClusterStopOnAttach skips stopping existing clusters that
		// the registry uses for running tests. It implies skipClusterWipeOnAttach.
		skipClusterStopOnAttach bool
		skipClusterWipeOnAttach bool
	}

	status struct {
		syncutil.Mutex
		running map[*test]struct{}
		pass    map[*test]struct{}
		fail    map[*test]struct{}
		skip    map[*test]struct{}
	}

	// cr keeps track of all live clusters.
	cr *clusterRegistry

	workersMu struct {
		syncutil.Mutex
		// workers is a map from worker name to information about each worker currently running tests.
		workers map[string]*workerStatus
	}

	// work maintains the remaining tests to run.
	work *workPool

	completedTestsMu struct {
		syncutil.Mutex
		// completed maintains information on all completed test runs.
		completed []completedTestInfo
	}
}

// newTestRunner constructs a testRunner.
//
// cr: The cluster registry with which all clusters will be registered. The
//   caller provides this because it needs to be able to shut clusters down
//   on Ctrl+C.
// buildVersion: The version of the Cockroach binary against which tests will run.
func newTestRunner(cr *clusterRegistry, buildVersion version.Version) *testRunner {
	r := &testRunner{
		cr:           cr,
		buildVersion: buildVersion,
	}
	r.config.skipClusterWipeOnAttach = !clusterWipe
	r.workersMu.workers = make(map[string]*workerStatus)
	return r
}

// clustersOpt groups options for the clusters to be used by the tests.
type clustersOpt struct {
	// The type of cluster to use. If localCluster, then no other fields can be
	// set.
	typ clusterType

	// If set, all the tests will run against this roachprod cluster.
	clusterName string

	// If set, all the clusters will use this ID as part of their name. When
	// roachtests is invoked by TeamCity, this will be the build id.
	clusterID string
	// The name of the user running the tests. This will be part of cluster names.
	user string

	// cpuQuota specifies how many CPUs can be used concurrently by the roachprod
	// clusters. While there's no quota available for creating a new cluster, the
	// test runner will wait for other tests to finish and their cluster to be
	// destroyed (or reused). Note that this limit is global, not per zone.
	cpuQuota int
	// If set, clusters will not be wiped or destroyed when a test using the
	// respective cluster fails. These clusters will linger around and will
	// continue counting towards the cpuQuota.
	keepClustersOnTestFailure bool
}

func (c clustersOpt) validate() error {
	if c.typ == localCluster {
		if c.clusterName != "" {
			return errors.New("clusterName cannot be set when typ=localCluster")
		}
		if c.clusterID != "" {
			return errors.New("clusterID cannot be set when typ=localCluster")
		}
	}
	return nil
}

// Run runs tests.
//
// Args:
// tests: The tests to run.
// count: How many times to run each test selected by filter.
// parallelism: How many workers to use for running tests. Tests are run
//   locally (although generally they run against remote roachprod clusters).
//   parallelism bounds the maximum number of tests that run concurrently. Note
//   that the concurrency is also affected by cpuQuota.
// clustersOpt: Options for the clusters used by the tests.
// lopt: Options for logging.
func (r *testRunner) Run(
	ctx context.Context,
	tests []testSpec,
	count int,
	parallelism int,
	clustersOpt clustersOpt,
	artifactsDir string,
	lopt loggingOpt,
) error {
	// Validate options.
	if len(tests) == 0 {
		return fmt.Errorf("no test matched filters")
	}
	if err := clustersOpt.validate(); err != nil {
		return err
	}
	if parallelism != 1 {
		if clustersOpt.clusterName != "" {
			return fmt.Errorf("--cluster incompatible with --parallelism. Use --parallelism=1")
		}
		if clustersOpt.typ == localCluster {
			return fmt.Errorf("--local incompatible with --parallelism. Use --parallelism=1")
		}
	}

	if name := clustersOpt.clusterName; name != "" {
		// Since we were given a cluster, check that all tests we have to run have
		// compatible specs. We should also check against the spec of the cluster,
		// but we don't currently have a way of doing that; we're relying on the
		// fact that attaching to the cluster will fail if the cluster is
		// incompatible.
		spec := tests[0].Cluster
		spec.Lifetime = 0
		for i := 1; i < len(tests); i++ {
			spec2 := tests[i].Cluster
			spec2.Lifetime = 0
			if spec != spec2 {
				return errors.Errorf("cluster specified but found tests "+
					"with incompatible specs: %s (%s) - %s (%s)",
					tests[0].Name, spec, tests[i].Name, spec2,
				)
			}
		}
	}

	var numConcurrentClusterCreations int
	if cloud == "aws" {
		// AWS has ridiculous API call limits, so we're going to create one cluster
		// at a time. Internally, roachprod has throttling for the calls required to
		// create a single cluster.
		numConcurrentClusterCreations = 1
	} else {
		numConcurrentClusterCreations = 1000
	}
	clusterFactory := newClusterFactory(
		clustersOpt.user, clustersOpt.clusterID, lopt.artifactsDir, r.cr, numConcurrentClusterCreations)

	// allocateCluster will be used by workers to create new clusters (or to attach
	// to an existing one).
	allocateCluster := func(
		ctx context.Context,
		t testSpec,
		alloc *quotapool.IntAlloc,
		artifactsDir string,
		wStatus *workerStatus,
	) (*cluster, error) {
		wStatus.SetStatus("creating cluster")
		defer wStatus.SetStatus("")

		lopt.l.PrintfCtx(ctx, "Creating new cluster for test %s: %s", t.Name, t.Cluster)

		existingClusterName := clustersOpt.clusterName
		if existingClusterName != "" {
			// Logs for attaching to a cluster go to a dedicated log file.
			logPath := filepath.Join(artifactsDir, runnerLogsDir, "cluster-create", existingClusterName+".log")
			clusterL, err := rootLogger(logPath, lopt.tee)
			if err != nil {
				return nil, err
			}
			defer clusterL.close()
			opt := attachOpt{
				skipValidation: r.config.skipClusterValidationOnAttach,
				skipStop:       r.config.skipClusterStopOnAttach,
				skipWipe:       r.config.skipClusterWipeOnAttach,
			}
			return attachToExistingCluster(ctx, existingClusterName, clusterL, t.Cluster, opt, r.cr)
		}

		cfg := clusterConfig{
			spec:         t.Cluster,
			artifactsDir: artifactsDir,
			localCluster: clustersOpt.typ == localCluster,
			alloc:        alloc,
		}
		return clusterFactory.newCluster(ctx, cfg, wStatus.SetStatus, lopt.tee)
	}

	// Seed the default rand source so that different runs get different cluster
	// IDs.
	rand.Seed(timeutil.Now().UnixNano())

	n := len(tests)
	if n*count < parallelism {
		// Don't spin up more workers than necessary.
		parallelism = n * count
	}

	r.status.running = make(map[*test]struct{})
	r.status.pass = make(map[*test]struct{})
	r.status.fail = make(map[*test]struct{})
	r.status.skip = make(map[*test]struct{})

	r.work = newWorkPool(tests, count)
	stopper := stop.NewStopper()
	errs := &workerErrors{}

	qp := quotapool.NewIntPool("cloud cpu", uint64(clustersOpt.cpuQuota))
	l := lopt.l

	var wg sync.WaitGroup
	for i := 0; i < parallelism; i++ {
		i := i // Copy for closure.
		wg.Add(1)
		stopper.RunWorker(ctx, func(ctx context.Context) {
			defer wg.Done()

			if err := r.runWorker(
				ctx, fmt.Sprintf("w%d", i) /* name */, r.work, qp,
				stopper.ShouldQuiesce(),
				clustersOpt.keepClustersOnTestFailure,
				lopt.artifactsDir, lopt.runnerLogPath, lopt.tee, lopt.stdout,
				allocateCluster,
				l,
			); err != nil {
				// A worker returned an error. Let's shut down.
				msg := fmt.Sprintf("Worker %d returned with error. Quiescing. Error: %+v", i, err)
				shout(ctx, l, lopt.stdout, msg)
				errs.AddErr(err)
				// Quiesce the stopper. This will cause all workers to not pick up more
				// tests after finishing the currently running one.
				stopper.Quiesce(ctx)
				// Interrupt everybody waiting for resources.
				if qp != nil {
					qp.Close(msg)
				}
			}
		})
	}

	// Wait for all the workers to finish.
	wg.Wait()
	r.cr.destroyAllClusters(ctx, l)

	if errs.Err() != nil {
		shout(ctx, l, lopt.stdout, "FAIL (err: %s)", errs.Err())
		return errs.Err()
	}
	passFailLine := r.generateReport()
	shout(ctx, l, lopt.stdout, passFailLine)
	if len(r.status.fail) > 0 {
		return fmt.Errorf("some tests failed")
	}
	return nil
}

type clusterAllocatorFn func(
	ctx context.Context,
	t testSpec,
	alloc *quotapool.IntAlloc,
	artifactsDir string,
	wStatus *workerStatus,
) (*cluster, error)

// runWorker runs tests in a loop until work is exhausted.
//
// Errors are returned in exceptional circumstances, like when a cluster failed
// to be created or when a test timed out and failed to react to its
// cancellation. Upon return, an attempt is performed to destroy the cluster used
// by this worker. If an error is returned, we might have "leaked" cpu quota
// because the cluster destruction might have failed but we've still released
// the quota. Also, we might have "leaked" a test goroutine (in the test
// nonresponsive to timeout case) which might still be running and doing
// arbitrary things to the cluster it was using.
//
// Args:
// name: The worker's name, to be used as a prefix for log messages.
// artifactsRootDir: The artifacts dir. Each test's logs are going to be under a
//   run_<n> dir. If empty, test log files will not be created.
// testRunnerLogPath: The path to the test runner's log. It will be copied to
//   failing tests' artifacts dir if running under TeamCity.
// stdout: The Writer to use for messages that need to go to stdout (e.g. the
//   "=== RUN" and "--- FAIL" lines).
// teeOpt: The teeing option for future test loggers.
// l: The logger to use for more verbose messages.
func (r *testRunner) runWorker(
	ctx context.Context,
	name string,
	work *workPool,
	qp *quotapool.IntPool,
	interrupt <-chan struct{},
	debug bool,
	artifactsRootDir string,
	testRunnerLogPath string,
	teeOpt teeOptType,
	stdout io.Writer,
	allocateCluster clusterAllocatorFn,
	l *logger,
) error {
	ctx = logtags.AddTag(ctx, name, nil /* value */)
	wStatus := r.addWorker(ctx, name)
	defer func() {
		r.removeWorker(ctx, name)
	}()

	var c *cluster // The cluster currently being used.
	var err error
	// When this method returns we'll destroy the cluster we had at the time.
	// Note that, if debug was set, c has been set to nil.
	defer func() {
		wStatus.SetTest(nil /* test */, testToRunRes{noWork: true})
		wStatus.SetStatus("worker done")
		wStatus.SetCluster(nil)

		if c == nil {
			return
		}
		doDestroy := ctx.Err() == nil
		if doDestroy {
			l.PrintfCtx(ctx, "Worker exiting; destroying cluster.")
			c.Destroy(context.Background(), closeLogger, l)
		} else {
			l.PrintfCtx(ctx, "Worker exiting with canceled ctx. Not destroying cluster.")
		}
	}()

	// Loop until there's no more work in the pool, we get interrupted, or an
	// error occurs.
	for {
		select {
		case <-interrupt:
			l.ErrorfCtx(ctx, "worker detected interruption")
			return errors.Errorf("interrupted")
		default:
			if ctx.Err() != nil {
				// The context has been canceled. No need to continue.
				return errors.Wrap(ctx.Err(), "worker ctx done")
			}
		}

		if c != nil {
			if _, ok := c.spec.ReusePolicy.(reusePolicyNone); ok {
				wStatus.SetStatus("destroying cluster")
				// We use a context that can't be canceled for the Destroy().
				c.Destroy(context.Background(), closeLogger, l)
				c = nil
			}
		}
		var testToRun testToRunRes
		wStatus.SetTest(nil /* test */, testToRunRes{})
		wStatus.SetStatus("getting work")
		testToRun, c, err = r.getWork(
			ctx, work, qp, c, interrupt, l,
			getWorkCallbacks{
				createCluster: func(ctx context.Context, ttr testToRunRes) (*cluster, error) {
					wStatus.SetTest(nil /* test */, ttr)
					return allocateCluster(ctx, ttr.spec, ttr.alloc, artifactsRootDir, wStatus)
				},
				onDestroy: func() {
					wStatus.SetCluster(nil)
				},
			})
		if err != nil || testToRun.noWork {
			return err
		}
		c.status("running test")

		// Prepare the test's logger.
		logPath := ""
		var artifactsDir string
		var artifactsSpec string
		if artifactsRootDir != "" {
			escapedTestName := teamCityNameEscape(testToRun.spec.Name)
			runSuffix := "run_" + strconv.Itoa(testToRun.runNum)

			base := filepath.Join(artifactsRootDir, escapedTestName)

			artifactsDir = filepath.Join(base, runSuffix)
			logPath = filepath.Join(artifactsDir, "test.log")

			// Map artifacts/TestFoo/** => TestFoo/**, i.e. collect the artifacts
			// for this test exactly as they are laid out on disk (when the time
			// comes).
			artifactsSpec = fmt.Sprintf("%s/** => %s", base, escapedTestName)
		}
		testL, err := rootLogger(logPath, teeOpt)
		if err != nil {
			return err
		}
		t := &test{
			spec:          &testToRun.spec,
			buildVersion:  r.buildVersion,
			artifactsDir:  artifactsDir,
			artifactsSpec: artifactsSpec,
			l:             testL,
		}
		// Tell the cluster that, from now on, it will be run "on behalf of this
		// test".
		c.setTest(t)
		wStatus.SetCluster(c)
		wStatus.SetTest(t, testToRun)
		wStatus.SetStatus("running test")

		// Now run the test.
		l.PrintfCtx(ctx, "starting test: %s:%d", testToRun.spec.Name, testToRun.runNum)
		var success bool
		success, err = r.runTest(ctx, t, testToRun.runNum, c, testRunnerLogPath, stdout, testL)
		if err != nil {
			shout(ctx, l, stdout, "test returned error: %s: %s", t.Name(), err)
			// Mark the test as failed if it isn't already.
			if !t.Failed() {
				t.printAndFail(0 /* skip */, err)
			}
		} else {
			msg := "test passed"
			if !success {
				msg = fmt.Sprintf("test failed: %s (run %d)", t.Name(), testToRun.runNum)
			}
			l.PrintfCtx(ctx, msg)
		}
		testL.close()
		if err != nil || t.Failed() {
			failureMsg := fmt.Sprintf("%s (%d) - ", testToRun.spec.Name, testToRun.runNum)
			if err != nil {
				failureMsg += fmt.Sprintf("%+v", err)
			} else {
				failureMsg += t.FailureMsg()
			}

			if debug {
				// Save the cluster for future debugging.
				c.Save(ctx, failureMsg, l)

				// Continue with a fresh cluster.
				c = nil
			} else {
				// On any test failure or error, we destroy the cluster. We could be
				// more selective, but this sounds safer.
				l.PrintfCtx(ctx, "destroying cluster %s because: %s", c, failureMsg)
				c.Destroy(ctx, closeLogger, l)
				c = nil
			}

			if err != nil {
				return err
			}
		} else {
			// Upon success fetch the perf artifacts from the remote hosts.
			getPerfArtifacts(ctx, l, c, t)
		}
	}
}

// getPerfArtifacts retrieves the perf artifacts for the test.
// If there's an error, oh well, don't do anything rash like fail a test
// which already passed.
func getPerfArtifacts(ctx context.Context, l *logger, c *cluster, t *test) {
	g := ctxgroup.WithContext(ctx)
	fetchNode := func(node int) func(context.Context) error {
		return func(ctx context.Context) error {
			testCmd := `'PERF_ARTIFACTS="` + perfArtifactsDir + `"
if [[ -d "${PERF_ARTIFACTS}" ]]; then
echo true
elif [[ -e "${PERF_ARTIFACTS}" ]]; then
ls -la "${PERF_ARTIFACTS}"
exit 1
else
echo false
fi'`
			out, err := c.RunWithBuffer(ctx, l, c.Node(node), "bash", "-c", testCmd)
			if err != nil {
				return errors.Wrapf(err, "failed to check for perf artifacts: %v", string(out))
			}
			switch out := strings.TrimSpace(string(out)); out {
			case "true":
				dst := fmt.Sprintf("%s/%d.%s", t.artifactsDir, node, perfArtifactsDir)
				return c.Get(ctx, l, perfArtifactsDir, dst, c.Node(node))
			case "false":
				l.PrintfCtx(ctx, "no perf artifacts exist on node %v", c.Node(node))
				return nil
			default:
				return errors.Errorf("unexpected output when checking for perf artifacts: %s", out)
			}
		}
	}
	for _, i := range c.All() {
		g.GoCtx(fetchNode(i))
	}
	if err := g.Wait(); err != nil {
		l.PrintfCtx(ctx, "failed to get perf artifacts: %v", err)
	}
}

func allStacks() []byte {
	// Collect up to 5mb worth of stacks.
	b := make([]byte, 5*(1<<20))
	return b[:runtime.Stack(b, true /* all */)]
}

// runTest runs a single test on the given cluster.
//
// An error is returned if the test is still running (on another goroutine) when
// this returns. This happens when the test doesn't respond to cancellation.
// Returns true if the test is considered to have passed, false otherwise.
//
// Args:
// c: The cluster on which the test will run. runTest() does not wipe or destroy
//   the cluster.
// testRunnerLogPath: The path to the test runner's log. It will be copied to
//   the test's artifacts dir if the test fails and we're running under
//   TeamCity.
func (r *testRunner) runTest(
	ctx context.Context,
	t *test,
	runNum int,
	c *cluster,
	testRunnerLogPath string,
	stdout io.Writer,
	l *logger,
) (bool, error) {
	if t.spec.Skip != "" {
		return false, fmt.Errorf("can't run skipped test: %s: %s", t.Name(), t.spec.Skip)
	}

	if teamCity {
		shout(ctx, l, stdout, "##teamcity[testStarted name='%s' flowId='%s']", t.Name(), t.Name())
	} else {
		shout(ctx, l, stdout, "=== RUN %s", t.Name())
	}

	r.status.Lock()
	r.status.running[t] = struct{}{}
	r.status.Unlock()

	t.runner = callerName()
	t.runnerID = goid.Get()

	defer func() {
		t.end = timeutil.Now()

		if err := recover(); err != nil {
			t.mu.Lock()
			t.mu.failed = true
			t.mu.output = append(t.mu.output, t.decorate(0 /* skip */, fmt.Sprint(err))...)
			t.mu.Unlock()
		}

		t.mu.Lock()
		t.mu.done = true
		t.mu.Unlock()

		durationStr := fmt.Sprintf("%.2fs", t.duration().Seconds())
		if t.Failed() {
			t.mu.Lock()
			output := fmt.Sprintf("test artifacts and logs in: %s\n", t.ArtifactsDir()) + string(t.mu.output)
			failLoc := t.mu.failLoc
			t.mu.Unlock()

			if teamCity {
				shout(ctx, l, stdout, "##teamcity[testFailed name='%s' details='%s' flowId='%s']",
					t.Name(), teamCityEscape(output), t.Name(),
				)

				// Copy a snapshot of the testrunner's log to the test's artifacts dir
				// so that we collect it below.
				cp := exec.Command("cp", testRunnerLogPath, t.artifactsDir)
				if err := cp.Run(); err != nil {
					l.ErrorfCtx(ctx, "failed to copy test runner's logs to test artifacts: %s", err)
				}
			}

			shout(ctx, l, stdout, "--- FAIL: %s (%s)\n%s", t.Name(), durationStr, output)
			// NB: check NodeCount > 0 to avoid posting issues from this pkg's unit tests.
			if issues.CanPost() && t.spec.Run != nil && t.spec.Cluster.NodeCount > 0 {
				authorEmail := getAuthorEmail(t.spec.Tags, failLoc.file, failLoc.line)
				if authorEmail == "" {
					ownerInfo, ok := roachtestOwners[t.spec.Owner]
					if ok {
						authorEmail = ownerInfo.ContactEmail
					}
				}
				branch := "<unknown branch>"
				if b := os.Getenv("TC_BUILD_BRANCH"); b != "" {
					branch = b
				}
				msg := fmt.Sprintf("The test failed on branch=%s, cloud=%s:\n%s",
					branch, cloud, output)
				artifacts := fmt.Sprintf("/%s", t.Name())

				req := issues.PostRequest{
					// TODO(tbg): actually use this as a template.
					TitleTemplate: fmt.Sprintf("roachtest: %s failed", t.Name()),
					// TODO(tbg): make a template better adapted to roachtest.
					BodyTemplate: issues.UnitTestFailureBody,
					PackageName:  "roachtest",
					TestName:     t.Name(),
					Message:      msg,
					Artifacts:    artifacts,
					AuthorEmail:  authorEmail,
					// Issues posted from roachtest are identifiable as such and
					// they are also release blockers (this label may be removed
					// by a human upon closer investigation).
					ExtraLabels: []string{"O-roachtest", "release-blocker"},
				}
				if err := issues.Post(
					context.Background(),
					req,
				); err != nil {
					shout(ctx, l, stdout, "failed to post issue: %s", err)
				}
			}
		} else {
			shout(ctx, l, stdout, "--- PASS: %s (%s)", t.Name(), durationStr)
			// If `##teamcity[testFailed ...]` is not present before `##teamcity[testFinished ...]`,
			// TeamCity regards the test as successful.
		}

		if teamCity {
			shout(ctx, l, stdout, "##teamcity[testFinished name='%s' flowId='%s']", t.Name(), t.Name())

			if t.artifactsSpec != "" {
				// Tell TeamCity to collect this test's artifacts now. The TC job
				// also collects the artifacts directory wholesale at the end, but
				// here we make sure that the artifacts for any test that has already
				// finished are available in the UI even before the job as a whole
				// has completed. We're using the exact same destination to avoid
				// duplication of any of the artifacts.
				shout(ctx, l, stdout, "##teamcity[publishArtifacts '%s']", t.artifactsSpec)
			}
		}

		r.recordTestFinish(completedTestInfo{
			test:    t.Name(),
			run:     runNum,
			start:   t.start,
			end:     t.end,
			pass:    !t.Failed(),
			failure: t.FailureMsg(),
		})
		r.status.Lock()
		delete(r.status.running, t)
		// Only include tests with a Run function in the summary output.
		if t.spec.Run != nil {
			if t.Failed() {
				r.status.fail[t] = struct{}{}
			} else if t.spec.Skip == "" {
				r.status.pass[t] = struct{}{}
			} else {
				r.status.skip[t] = struct{}{}
			}
		}
		r.status.Unlock()
	}()

	t.start = timeutil.Now()

	timeout := 10 * time.Hour
	if t.spec.Timeout != 0 {
		timeout = t.spec.Timeout
	}
	// Make sure the cluster has enough life left for the test plus enough headroom
	// after the test finishes so that the next test can be selected. If it
	// doesn't, extend it.
	minExp := timeutil.Now().Add(timeout + time.Hour)
	if c.expiration.Before(minExp) {
		extend := minExp.Sub(c.expiration)
		l.PrintfCtx(ctx, "cluster needs to survive until %s, but has expiration: %s. Extending.",
			minExp, c.expiration)
		if err := c.Extend(ctx, extend, l); err != nil {
			t.printfAndFail(0 /* skip */, "failed to extend cluster: %s", err)
			return false, nil
		}
	}

	runCtx, cancel := context.WithCancel(ctx)
	defer cancel()
	t.mu.Lock()
	// t.Fatal() will cancel this context.
	t.mu.cancel = cancel
	t.mu.Unlock()

	// We run the actual test in a different goroutine because it might call
	// t.Fatal() which kills the goroutine, and also because we want to enforce a
	// timeout.
	success := false
	done := make(chan struct{})
	go func() {
		defer close(done) // closed only after we've grabbed the debug info below

		// This is the call to actually run the test.
		defer func() {
			if r := recover(); r != nil {
				// TODO(andreimatei): prevent the cluster from being reused.
				t.Fatalf("test panicked: %v", r)
			}
		}()

		t.spec.Run(runCtx, t, c)
	}()

	select {
	case <-done:
	case <-time.After(timeout):
		// Timeouts are often opaque. Improve our chances by dumping the stack
		// so that at least we can piece together what the test is trying to
		// do at this very moment.
		// In large roachtest runs, this will dump everyone else's stack as well,
		// but this is just hard to avoid. Moving to a worker model in which
		// each roachtest is spawned off as a separate process would address
		// this at the expense of ease of communication between the runner and
		// the test.
		//
		// Do this before we cancel the context, which might (hopefully) unblock
		// the test. We want to see where it got stuck.
		const stacksFile = "__stacks"
		if cl, err := t.l.ChildLogger(stacksFile, quietStderr, quietStdout); err == nil {
			cl.PrintfCtx(ctx, "all stacks:\n\n%s\n", allStacks())
			t.l.PrintfCtx(ctx, "dumped stacks to %s", stacksFile)
		}
		// Now kill anything going on in this cluster while collecting stacks
		// to the logs, to get the server side of the hang.
		//
		// TODO(tbg): send --sig=3 followed by a hard kill after we've fixed
		// https://github.com/cockroachdb/cockroach/issues/45875.
		// Signal 11 will dump stacks, but it might be confusing to folks
		// who debug from the artifacts only.
		//
		// Don't use the surrounding context, which is likely already canceled.
		if nodes := c.All(); len(nodes) > 0 { // avoid unit tests, which have no nodes
			innerCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
			_ = c.StopE(innerCtx, c.All(), stopArgs("--sig=11"))
			cancel()
		}

		// Mark the test as failed (which will also cancel its context). Then
		// we'll wait up to 5 minutes in the hope that the test reacts either to
		// the ctx cancellation or to the fact that it was marked as failed (and
		// all processes nuked out from under it).
		// If that happens, great - we return normally and so the cluster can be
		// reused. If the test does not react to anything, then we return an
		// error, which will cause the caller to stop everything and destroy
		// this cluster (as well as all the others). The cluster cannot be
		// reused since we have a runaway test goroutine that's presumably going
		// to continue using the cluster.
		t.printfAndFail(0 /* skip */, "test timed out (%s)", timeout)
		select {
		case <-done:
			if success {
				panic("expected success=false after a timeout")
			}
		case <-time.After(5 * time.Minute):
			// We really shouldn't get here unless the test code somehow managed
			// to deadlock without blocking on anything remote - since we killed
			// everything.
			const msg = "test timed out and afterwards failed to respond to cancellation"
			t.l.PrintfCtx(ctx, msg)
			r.collectClusterLogs(ctx, c, t.l)
			// We return an error here because the test goroutine is still running, so
			// we want to alert the caller of this unusual situation.
			return false, errors.New(msg)
		}
	}

	// Detect replica divergence (i.e. ranges in which replicas have arrived
	// at the same log position with different states).
	c.FailOnReplicaDivergence(ctx, t)
	// Detect dead nodes in an inner defer. Note that this will call
	// t.printfAndFail() when appropriate, which will cause the code below to
	// enter the t.Failed() branch.
	c.FailOnDeadNodes(ctx, t)

	if t.Failed() {
		r.collectClusterLogs(ctx, c, t.l)
		return false, nil
	}
	return true, nil
}

func (r *testRunner) collectClusterLogs(ctx context.Context, c *cluster, l *logger) {
	// NB: fetch the logs even when we have a debug zip because
	// debug zip can't ever get the logs for down nodes.
	// We only save artifacts for failed tests in CI, so this
	// duplication is acceptable.
	// NB: fetch the logs *first* in case one of the other steps
	// below has problems. For example, `debug zip` is known to
	// hang sometimes at the time of writing, see:
	// https://github.com/cockroachdb/cockroach/issues/39620
	l.PrintfCtx(ctx, "collecting cluster logs")
	if err := c.FetchLogs(ctx); err != nil {
		l.Printf("failed to download logs: %s", err)
	}
	if err := c.FetchDmesg(ctx); err != nil {
		l.Printf("failed to fetch dmesg: %s", err)
	}
	if err := c.FetchJournalctl(ctx); err != nil {
		l.Printf("failed to fetch journalctl: %s", err)
	}
	if err := c.FetchCores(ctx); err != nil {
		l.Printf("failed to fetch cores: %s", err)
	}
	if err := c.CopyRoachprodState(ctx); err != nil {
		l.Printf("failed to copy roachprod state: %s", err)
	}
	if err := c.FetchDebugZip(ctx); err != nil {
		l.Printf("failed to collect zip: %s", err)
	}
}

func callerName() string {
	// Make room for the skip PC.
	var pc [2]uintptr
	n := runtime.Callers(2, pc[:]) // runtime.Callers + callerName
	if n == 0 {
		panic("zero callers found")
	}
	frames := runtime.CallersFrames(pc[:n])
	frame, _ := frames.Next()
	return frame.Function
}

// generateReport produces the final pass/fail line and posts a Slack report
// if configured.
func (r *testRunner) generateReport() string {
	r.status.Lock()
	defer r.status.Unlock()
	postSlackReport(r.status.pass, r.status.fail, r.status.skip)

	fails := len(r.status.fail)
	var msg string
	if fails > 0 {
		msg = fmt.Sprintf("FAIL (%d fails)\n", fails)
	} else {
		msg = "PASS"
	}
	return msg
}

type getWorkCallbacks struct {
	createCluster func(context.Context, testToRunRes) (*cluster, error)
	onDestroy     func()
}

// getWork selects the next test to run and creates a suitable cluster for it
// if need be.
// If a new cluster needs to be created, the method blocks until there are
// enough resources available to run it.
// getWork takes in a cluster; if not nil, tests that can reuse it are
// preferred. If a test that can reuse it is not found (or if there's no more
// work), the cluster is destroyed (and so its resources are released).
//
// If the cluster is to be reused, getWork() wipes it.
func (r *testRunner) getWork(
	ctx context.Context,
	work *workPool,
	qp *quotapool.IntPool,
	c *cluster,
	interrupt <-chan struct{},
	l *logger,
	callbacks getWorkCallbacks,
) (testToRunRes, *cluster, error) {

	select {
	case <-interrupt:
		return testToRunRes{}, nil, fmt.Errorf("interrupted")
	default:
	}

	testToRun, err := work.getTestToRun(ctx, c, qp, r.cr, callbacks.onDestroy, l)
	if err != nil {
		return testToRunRes{}, nil, err
	}
	if !testToRun.noWork {
		l.PrintfCtx(ctx, "Selected test: %s run: %d.", testToRun.spec.Name, testToRun.runNum)
	}
	// Are we done?
	if testToRun.noWork {
		return testToRun, nil, nil
	}

	// Create a cluster, if we no longer have one.
	if testToRun.canReuseCluster {
		l.PrintfCtx(ctx, "Using existing cluster: %s. Wiping.", c.name)
		if err := c.WipeE(ctx, l); err != nil {
			return testToRunRes{}, nil, err
		}
		if err := c.RunL(ctx, l, c.All(), "rm -rf "+perfArtifactsDir); err != nil {
			return testToRunRes{}, nil, errors.Wrapf(err, "failed to remove perf artifacts dir")
		}
		// Overwrite the spec of the cluster with the one coming from the test. In
		// particular, this overwrites the reuse policy to reflect what the test
		// intends to do with it.
		c.spec = testToRun.spec.Cluster
	} else {
		var err error
		c, err = callbacks.createCluster(ctx, testToRun)
		if err != nil {
			return testToRunRes{}, nil, err
		}
	}
	return testToRun, c, nil
}

// addWorker updates the bookkeeping for one more worker.
func (r *testRunner) addWorker(ctx context.Context, name string) *workerStatus {
	r.workersMu.Lock()
	defer r.workersMu.Unlock()
	w := &workerStatus{name: name}
	if _, ok := r.workersMu.workers[name]; ok {
		log.Fatalf(ctx, "worker %q already exists", name)
	}
	r.workersMu.workers[name] = w
	return w
}

// removeWorker deletes the bookkeeping for a worker that has finished running.
func (r *testRunner) removeWorker(ctx context.Context, name string) {
	r.workersMu.Lock()
	delete(r.workersMu.workers, name)
	r.workersMu.Unlock()
}

// runHTTPServer starts a server running in the background.
//
// httpPort: The port on which to serve the web interface. Pass 0 to allocate
//   a port automatically (which will be printed to stdout).
func (r *testRunner) runHTTPServer(httpPort int, stdout io.Writer) error {
	http.HandleFunc("/", r.serveHTTP)
	// Run an http server in the background.
	// We handle the case where httpPort is 0, which means we automatically
	// allocate a port.
	listener, err := net.Listen("tcp", fmt.Sprintf(":%d", httpPort))
	if err != nil {
		return err
	}
	httpPort = listener.Addr().(*net.TCPAddr).Port
	go func() {
		if err := http.Serve(listener, nil /* handler */); err != nil {
			panic(err)
		}
	}()
	fmt.Fprintf(stdout, "HTTP server listening on all network interfaces, port %d.\n", httpPort)
	return nil
}

// serveHTTP is the handler for the test runner's web server.
func (r *testRunner) serveHTTP(wr http.ResponseWriter, req *http.Request) {
	fmt.Fprintf(wr, "<html><body>")
	fmt.Fprintf(wr, "<a href='debug/pprof'>pprof</a>")
	fmt.Fprintf(wr, "<p>")
	// Print the workers report.
	fmt.Fprintf(wr, "<h2>Workers:</h2>")
	fmt.Fprintf(wr, `<table border='1'>
	<tr><th>Worker</th>
	<th>Worker Status</th>
	<th>Test</th>
	<th>Cluster</th>
	<th>Cluster reused</th>
	<th>Test Status</th>
	</tr>`)
	r.workersMu.Lock()
	workers := make([]*workerStatus, len(r.workersMu.workers))
	i := 0
	for _, w := range r.workersMu.workers {
		workers[i] = w
		i++
	}
	r.workersMu.Unlock()
	sort.Slice(workers, func(i int, j int) bool {
		l := workers[i]
		r := workers[j]
		return strings.Compare(l.name, r.name) < 0
	})
	for _, w := range workers {
		var testName string
		ttr := w.TestToRun()
		clusterReused := ""
		if ttr.noWork {
			testName = "done"
		} else if ttr.spec.Name == "" {
			testName = "N/A"
		} else {
			testName = fmt.Sprintf("%s (run %d)", ttr.spec.Name, ttr.runNum)
			if ttr.canReuseCluster {
				clusterReused = "yes"
			} else {
				clusterReused = "no"
			}
		}
		var clusterName string
		if w.Cluster() != nil {
			clusterName = w.Cluster().name
		}
		t := w.Test()
		testStatus := "N/A"
		if t != nil {
			testStatus = t.GetStatus()
		}
		fmt.Fprintf(wr, "<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>\n",
			w.name, w.Status(), testName, clusterName, clusterReused, testStatus)
	}
	fmt.Fprintf(wr, "</table>")

	// Print the finished tests report.
	fmt.Fprintf(wr, "<p>")
	fmt.Fprintf(wr, "<h2>Finished tests:</h2>")
	fmt.Fprintf(wr, `<table border='1'>
	<tr><th>Test</th>
	<th>Status</th>
	<th>Duration</th>
	</tr>`)
	for _, t := range r.getCompletedTests() {
		name := fmt.Sprintf("%s (run %d)", t.test, t.run)
		status := "PASS"
		if !t.pass {
			status = "FAIL " + t.failure
		}
		duration := fmt.Sprintf("%s (%s - %s)", t.end.Sub(t.start), t.start, t.end)
		fmt.Fprintf(wr, "<tr><td>%s</td><td>%s</td><td>%s</td></tr>", name, status, duration)
	}
	fmt.Fprintf(wr, "</table>")

	// Print the saved clusters report.
	fmt.Fprintf(wr, "<p>")
	fmt.Fprintf(wr, "<h2>Clusters left alive for further debugging "+
		"(if --debug was specified):</h2>")
	fmt.Fprintf(wr, `<table border='1'>
	<tr><th>Cluster</th>
	<th>Test</th>
	</tr>`)
	for _, c := range r.cr.savedClusters() {
		fmt.Fprintf(wr, "<tr><td>%s</td><td>%s</td></tr>", c.name, c.savedMsg)
	}
	fmt.Fprintf(wr, "</table>")

	fmt.Fprintf(wr, "<p>")
	fmt.Fprintf(wr, "<h2>Tests left:</h2>")
	fmt.Fprintf(wr, `<table border='1'>
	<tr><th>Test</th>
	<th>Runs</th>
	</tr>`)
	for _, t := range r.work.workRemaining() {
		fmt.Fprintf(wr, "<tr><td>%s</td><td>%d</td></tr>", t.spec.Name, t.count)
	}

	fmt.Fprintf(wr, "</body></html>")
}

// recordTestFinish updates bookkeeping when a test finishes.
func (r *testRunner) recordTestFinish(info completedTestInfo) {
	r.completedTestsMu.Lock()
	r.completedTestsMu.completed = append(r.completedTestsMu.completed, info)
	r.completedTestsMu.Unlock()
}

// getCompletedTests returns info on all tests that finished running.
func (r *testRunner) getCompletedTests() []completedTestInfo {
	r.completedTestsMu.Lock()
	defer r.completedTestsMu.Unlock()
	res := make([]completedTestInfo, len(r.completedTestsMu.completed))
	copy(res, r.completedTestsMu.completed)
	return res
}

// completedTestInfo represents information on a completed test run.
type completedTestInfo struct {
	test    string
	run     int
	start   time.Time
	end     time.Time
	pass    bool
	failure string
}

// PredecessorVersion returns a recent predecessor of the build version (i.e.
// the build tag of the main binary). For example, if the running binary is from
// the master branch prior to releasing 19.2.0, this will return a recent
// (ideally though not necessarily the latest) 19.1 patch release.
func PredecessorVersion(buildVersion version.Version) (string, error) {
	if buildVersion == (version.Version{}) {
		return "", errors.Errorf("buildVersion not set")
	}

	buildVersionMajorMinor := fmt.Sprintf("%d.%d", buildVersion.Major(), buildVersion.Minor())

	// NB: you can update the values in this map to point at newer patch
	// releases. You will need to run acceptance/version-upgrade with the
	// checkpoint option enabled to create the missing store directory fixture
	// (see runVersionUpgrade). The same is true for adding a new key to this
	// map.
	verMap := map[string]string{
		"20.2": "20.1.0",
		"20.1": "19.2.7",
		"19.2": "19.1.5",
		"19.1": "2.1.9",
		"2.2":  "2.1.9",
		"2.1":  "2.0.7",
	}
	v, ok := verMap[buildVersionMajorMinor]
	if !ok {
		return "", errors.Errorf("prev version not set for version: %s", buildVersionMajorMinor)
	}
	return v, nil
}

type workerErrors struct {
	mu struct {
		syncutil.Mutex
		errs []error
	}
}

func (we *workerErrors) AddErr(err error) {
	we.mu.Lock()
	defer we.mu.Unlock()
	we.mu.errs = append(we.mu.errs, err)
}

func (we *workerErrors) Err() error {
	we.mu.Lock()
	defer we.mu.Unlock()
	if len(we.mu.errs) == 0 {
		return nil
	}
	// TODO(andrei): Maybe we should do something other than return the first
	// error...
	return we.mu.errs[0]
}
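
// Illustrative sketch (not part of the original file): how a caller might use
// workerErrors to aggregate failures from several worker goroutines, mirroring
// what Run does above with its per-worker runWorker calls. runWorkersSketch,
// doWork, and parallelism are hypothetical placeholders, not identifiers from
// this package.
//
//	func runWorkersSketch(parallelism int, doWork func(i int) error) error {
//		var errs workerErrors
//		var wg sync.WaitGroup
//		for i := 0; i < parallelism; i++ {
//			wg.Add(1)
//			go func(i int) {
//				defer wg.Done()
//				if err := doWork(i); err != nil {
//					// AddErr is safe to call concurrently; it locks internally.
//					errs.AddErr(err)
//				}
//			}(i)
//		}
//		wg.Wait()
//		// Err() returns only the first recorded error; see the TODO above.
//		return errs.Err()
//	}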