github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachtest/test.go (about) 1 // Copyright 2018 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package main 12 13 import ( 14 "bytes" 15 "context" 16 "fmt" 17 "io" 18 // For the debug http handlers. 19 _ "net/http/pprof" 20 "os/exec" 21 "regexp" 22 "runtime" 23 "strings" 24 "time" 25 26 "github.com/cockroachdb/cockroach/pkg/util/syncutil" 27 "github.com/cockroachdb/cockroach/pkg/util/timeutil" 28 "github.com/cockroachdb/cockroach/pkg/util/version" 29 "github.com/petermattis/goid" 30 ) 31 32 type testSpec struct { 33 Skip string // if non-empty, test will be skipped 34 // When Skip is set, this can contain more text to be printed in the logs 35 // after the "--- SKIP" line. 36 SkipDetails string 37 38 Name string 39 // Owner is the name of the team responsible for signing off on failures of 40 // this test that happen in the release process. This must be one of a limited 41 // set of values (the keys in the roachtestTeams map). 42 Owner Owner 43 // The maximum duration the test is allowed to run before it is considered 44 // failed. If not specified, the default timeout is 10m before the test's 45 // associated cluster expires. The timeout is always truncated to 10m before 46 // the test's cluster expires. 47 Timeout time.Duration 48 // MinVersion indicates the minimum cockroach version that is required for 49 // the test to be run. If MinVersion is less than the version specified 50 // --cockroach-version, Skip will be populated causing the test to be 51 // skipped. 52 MinVersion string 53 minVersion *version.Version 54 // Tags is a set of tags associated with the test that allow grouping 55 // tests. If no tags are specified, the set ["default"] is automatically 56 // given. 57 Tags []string 58 // Cluster provides the specification for the cluster to use for the test. 59 Cluster clusterSpec 60 61 // UseIOBarrier controls the local-ssd-no-ext4-barrier flag passed to 62 // roachprod when creating a cluster. If set, the flag is not passed, and so 63 // you get durable writes. If not set (the default!), the filesystem is 64 // mounted without the barrier. 65 // 66 // The default (false) is chosen because it the no-barrier option is needed 67 // explicitly by some tests (particularly benchmarks, ironically, since they'd 68 // rather measure other things than I/O) and the vast majority of other tests 69 // don't care - there's no durability across machine crashes that roachtests 70 // care about. 71 UseIOBarrier bool 72 73 // Run is the test function. 74 Run func(ctx context.Context, t *test, c *cluster) 75 } 76 77 // perfArtifactsDir is the directory on cluster nodes in which perf artifacts 78 // reside. Upon success this directory is copied into test artifactsDir from 79 // each node in the cluster. 80 const perfArtifactsDir = "perf" 81 82 // matchOrSkip returns true if the filter matches the test. If the filter does 83 // not match the test because the tag filter does not match, the test is 84 // matched, but marked as skipped. 85 func (t *testSpec) matchOrSkip(filter *testFilter) bool { 86 if !filter.name.MatchString(t.Name) { 87 return false 88 } 89 if len(t.Tags) == 0 { 90 if !filter.tag.MatchString("default") { 91 t.Skip = fmt.Sprintf("%s does not match [default]", filter.rawTag) 92 } 93 return true 94 } 95 for _, t := range t.Tags { 96 if filter.tag.MatchString(t) { 97 return true 98 } 99 } 100 t.Skip = fmt.Sprintf("%s does not match %s", filter.rawTag, t.Tags) 101 return true 102 } 103 104 type testStatus struct { 105 msg string 106 time time.Time 107 progress float64 108 } 109 110 type test struct { 111 spec *testSpec 112 113 // buildVersion is the version of the Cockroach binary that the test will run 114 // against. 115 buildVersion version.Version 116 117 // l is the logger that the test will use for its output. 118 l *logger 119 120 runner string 121 runnerID int64 122 start time.Time 123 end time.Time 124 125 // artifactsDir is the path to the directory holding all the artifacts for 126 // this test. It will contain a test.log file and cluster logs. 127 artifactsDir string 128 // artifactsSpec is a TeamCity artifacts spec used to publish this test's 129 // artifacts. See: 130 // https://www.jetbrains.com/help/teamcity/2019.1/configuring-general-settings.html#Artifact-Paths 131 artifactsSpec string 132 133 mu struct { 134 syncutil.RWMutex 135 done bool 136 failed bool 137 // cancel, if set, is called from the t.Fatal() family of functions when the 138 // test is being marked as failed (i.e. when the failed field above is also 139 // set). This is used to cancel the context passed to t.spec.Run(), so async 140 // test goroutines can be notified. 141 cancel func() 142 failLoc struct { 143 file string 144 line int 145 } 146 failureMsg string 147 status map[int64]testStatus 148 output []byte 149 } 150 } 151 152 func (t *test) Name() string { 153 return t.spec.Name 154 } 155 156 func (t *test) logger() *logger { 157 return t.l 158 } 159 160 func (t *test) status(ctx context.Context, id int64, args ...interface{}) { 161 t.mu.Lock() 162 defer t.mu.Unlock() 163 164 if t.mu.status == nil { 165 t.mu.status = make(map[int64]testStatus) 166 } 167 if len(args) == 0 { 168 delete(t.mu.status, id) 169 return 170 } 171 msg := fmt.Sprint(args...) 172 t.mu.status[id] = testStatus{ 173 msg: msg, 174 time: timeutil.Now(), 175 } 176 if !t.l.closed() { 177 if id == t.runnerID { 178 t.l.PrintfCtxDepth(ctx, 2, "test status: %s", msg) 179 } else { 180 t.l.PrintfCtxDepth(ctx, 2, "test worker status: %s", msg) 181 } 182 } 183 } 184 185 // Status sets the main status message for the test. When called from the main 186 // test goroutine (i.e. the goroutine on which testSpec.Run is invoked), this 187 // is equivalent to calling WorkerStatus. If no arguments are specified, the 188 // status message is erased. 189 func (t *test) Status(args ...interface{}) { 190 t.status(context.TODO(), t.runnerID, args...) 191 } 192 193 // GetStatus returns the status of the tests's main goroutine. 194 func (t *test) GetStatus() string { 195 t.mu.Lock() 196 defer t.mu.Unlock() 197 status, ok := t.mu.status[t.runnerID] 198 if ok { 199 return fmt.Sprintf("%s (set %s ago)", status.msg, timeutil.Now().Sub(status.time).Round(time.Second)) 200 } 201 return "N/A" 202 } 203 204 // WorkerStatus sets the status message for a worker goroutine associated with 205 // the test. The status message should be cleared before the goroutine exits by 206 // calling WorkerStatus with no arguments. 207 func (t *test) WorkerStatus(args ...interface{}) { 208 t.status(context.TODO(), goid.Get(), args...) 209 } 210 211 func (t *test) progress(id int64, frac float64) { 212 t.mu.Lock() 213 defer t.mu.Unlock() 214 215 if t.mu.status == nil { 216 t.mu.status = make(map[int64]testStatus) 217 } 218 status := t.mu.status[id] 219 status.progress = frac 220 t.mu.status[id] = status 221 } 222 223 // Progress sets the progress (a fraction in the range [0,1]) associated with 224 // the main test status messasge. When called from the main test goroutine 225 // (i.e. the goroutine on which testSpec.Run is invoked), this is equivalent to 226 // calling WorkerProgress. 227 func (t *test) Progress(frac float64) { 228 t.progress(t.runnerID, frac) 229 } 230 231 // WorkerProgress sets the progress (a fraction in the range [0,1]) associated 232 // with the a worker status messasge. 233 func (t *test) WorkerProgress(frac float64) { 234 t.progress(goid.Get(), frac) 235 } 236 237 // Skip records msg into t.spec.Skip and calls runtime.Goexit() - thus 238 // interrupting the running of the test. 239 func (t *test) Skip(msg string, details string) { 240 t.spec.Skip = msg 241 t.spec.SkipDetails = details 242 runtime.Goexit() 243 } 244 245 // Fatal marks the test as failed, prints the args to t.l, and calls 246 // runtime.GoExit(). It can be called multiple times. 247 // 248 // If the only argument is an error, it is formatted by "%+v", so it will show 249 // stack traces and such. 250 // 251 // ATTENTION: Since this calls runtime.GoExit(), it should only be called from a 252 // test's closure. The test runner itself should never call this. 253 func (t *test) Fatal(args ...interface{}) { 254 t.fatalfInner("" /* format */, args...) 255 } 256 257 // Fatalf is like Fatal, but takes a format string. 258 func (t *test) Fatalf(format string, args ...interface{}) { 259 t.fatalfInner(format, args...) 260 } 261 262 // FailNow implements the TestingT interface. 263 func (t *test) FailNow() { 264 t.Fatal() 265 } 266 267 // Errorf implements the TestingT interface. 268 func (t *test) Errorf(format string, args ...interface{}) { 269 t.Fatalf(format, args...) 270 } 271 272 func (t *test) fatalfInner(format string, args ...interface{}) { 273 // Skip two frames: our own and the caller. 274 if format != "" { 275 t.printfAndFail(2 /* skip */, format, args...) 276 } else { 277 t.printAndFail(2 /* skip */, args...) 278 } 279 runtime.Goexit() 280 } 281 282 // FatalIfErr calls t.Fatal() if err != nil. 283 func FatalIfErr(t *test, err error) { 284 if err != nil { 285 t.fatalfInner("" /* format */, err) 286 } 287 } 288 289 func (t *test) printAndFail(skip int, args ...interface{}) { 290 var msg string 291 if len(args) == 1 { 292 // If we were passed only an error, then format it with "%+v" in order to 293 // get any stack traces. 294 if err, ok := args[0].(error); ok { 295 msg = fmt.Sprintf("%+v", err) 296 } 297 } 298 if msg == "" { 299 msg = fmt.Sprint(args...) 300 } 301 t.failWithMsg(t.decorate(skip+1, msg)) 302 } 303 304 func (t *test) printfAndFail(skip int, format string, args ...interface{}) { 305 if format == "" { 306 panic(fmt.Sprintf("invalid empty format. args: %s", args)) 307 } 308 t.failWithMsg(t.decorate(skip+1, fmt.Sprintf(format, args...))) 309 } 310 311 func (t *test) failWithMsg(msg string) { 312 t.mu.Lock() 313 defer t.mu.Unlock() 314 315 prefix := "" 316 if t.mu.failed { 317 prefix = "[not the first failure] " 318 // NB: the first failure is not always the relevant one due to: 319 // https://github.com/cockroachdb/cockroach/issues/44436 320 // 321 // So we chain all failures together in the order in which we see 322 // them. 323 msg = "\n" + msg 324 } 325 t.l.Printf("%stest failure: %s", prefix, msg) 326 327 t.mu.failed = true 328 t.mu.failureMsg += msg 329 t.mu.output = append(t.mu.output, msg...) 330 if t.mu.cancel != nil { 331 t.mu.cancel() 332 } 333 } 334 335 // Args: 336 // skip: The number of stack frames to exclude from the result. 0 means that 337 // the caller will be the first frame identified. 1 means the caller's caller 338 // will be the first, etc. 339 func (t *test) decorate(skip int, s string) string { 340 // Skip two extra frames to account for this function and runtime.Callers 341 // itself. 342 var pc [50]uintptr 343 n := runtime.Callers(2+skip, pc[:]) 344 if n == 0 { 345 panic("zero callers found") 346 } 347 348 buf := new(bytes.Buffer) 349 frames := runtime.CallersFrames(pc[:n]) 350 sep := "\t" 351 runnerFound := false 352 for { 353 if runnerFound { 354 break 355 } 356 357 frame, more := frames.Next() 358 if !more { 359 break 360 } 361 if frame.Function == t.runner { 362 runnerFound = true 363 364 // Handle the special case of the runner function being the caller of 365 // t.Fatal(). In that case, that's the line to be used for issue creation. 366 if t.mu.failLoc.file == "" { 367 t.mu.failLoc.file = frame.File 368 t.mu.failLoc.line = frame.Line 369 } 370 } 371 if !t.mu.failed && !runnerFound { 372 // Keep track of the highest stack frame that is lower than the t.runner 373 // stack frame. This is used to determine the author of that line of code 374 // and issue assignment. 375 t.mu.failLoc.file = frame.File 376 t.mu.failLoc.line = frame.Line 377 } 378 file := frame.File 379 if index := strings.LastIndexByte(file, '/'); index >= 0 { 380 file = file[index+1:] 381 } 382 fmt.Fprintf(buf, "%s%s:%d", sep, file, frame.Line) 383 sep = "," 384 } 385 buf.WriteString(": ") 386 387 lines := strings.Split(s, "\n") 388 if l := len(lines); l > 1 && lines[l-1] == "" { 389 lines = lines[:l-1] 390 } 391 for i, line := range lines { 392 if i > 0 { 393 buf.WriteString("\n\t\t") 394 } 395 buf.WriteString(line) 396 } 397 buf.WriteByte('\n') 398 return buf.String() 399 } 400 401 func (t *test) duration() time.Duration { 402 return t.end.Sub(t.start) 403 } 404 405 func (t *test) Failed() bool { 406 t.mu.RLock() 407 defer t.mu.RUnlock() 408 return t.mu.failed 409 } 410 411 func (t *test) FailureMsg() string { 412 t.mu.RLock() 413 defer t.mu.RUnlock() 414 return t.mu.failureMsg 415 } 416 417 func (t *test) ArtifactsDir() string { 418 return t.artifactsDir 419 } 420 421 // IsBuildVersion returns true if the build version is greater than or equal to 422 // minVersion. This allows a test to optionally perform additional checks 423 // depending on the cockroach version it is running against. Note that the 424 // versions are Cockroach build tag version numbers, not the internal cluster 425 // version number. 426 func (t *test) IsBuildVersion(minVersion string) bool { 427 vers, err := version.Parse(minVersion) 428 if err != nil { 429 t.Fatal(err) 430 } 431 if p := vers.PreRelease(); p != "" { 432 panic("cannot specify a prerelease: " + p) 433 } 434 // We append "-0" to the min-version spec so that we capture all 435 // prereleases of the specified version. Otherwise, "v2.1.0" would compare 436 // greater than "v2.1.0-alpha.x". 437 vers = version.MustParse(minVersion + "-0") 438 return t.buildVersion.AtLeast(vers) 439 } 440 441 // teamCityEscape escapes a string for use as <value> in a key='<value>' attribute 442 // in TeamCity build output marker. 443 // Documentation here: https://confluence.jetbrains.com/display/TCD10/Build+Script+Interaction+with+TeamCity#BuildScriptInteractionwithTeamCity-Escapedvalues 444 func teamCityEscape(s string) string { 445 r := strings.NewReplacer( 446 "\n", "|n", 447 "'", "|'", 448 "|", "||", 449 "[", "|[", 450 "]", "|]", 451 ) 452 return r.Replace(s) 453 } 454 455 func teamCityNameEscape(name string) string { 456 return strings.Replace(name, ",", "_", -1) 457 } 458 459 // getAuthorEmail retrieves the author of a line of code. Returns the empty 460 // string if the author cannot be determined. Some test tags override this 461 // behavior and have a hardcoded author email. 462 func getAuthorEmail(tags []string, file string, line int) string { 463 for _, tag := range tags { 464 if tag == `orm` || tag == `driver` { 465 return `rafi@cockroachlabs.com` 466 } 467 } 468 const repo = "github.com/cockroachdb/cockroach/" 469 i := strings.Index(file, repo) 470 if i == -1 { 471 return "" 472 } 473 file = file[i+len(repo):] 474 475 cmd := exec.Command(`/bin/bash`, `-c`, 476 fmt.Sprintf(`git blame --porcelain -L%d,+1 $(git rev-parse --show-toplevel)/%s | grep author-mail`, 477 line, file)) 478 // This command returns output such as: 479 // author-mail <jordan@cockroachlabs.com> 480 out, err := cmd.CombinedOutput() 481 if err != nil { 482 return "" 483 } 484 re := regexp.MustCompile("author-mail <(.*)>") 485 matches := re.FindSubmatch(out) 486 if matches == nil { 487 return "" 488 } 489 return string(matches[1]) 490 } 491 492 type testWithCount struct { 493 spec testSpec 494 // count maintains the number of runs remaining for a test. 495 count int 496 } 497 498 type clusterType int 499 500 const ( 501 localCluster clusterType = iota 502 roachprodCluster 503 ) 504 505 type loggingOpt struct { 506 // l is the test runner logger. 507 // Note that individual test runs will use a different logger. 508 l *logger 509 // tee controls whether test logs (not test runner logs) also go to stdout or 510 // not. 511 tee teeOptType 512 stdout, stderr io.Writer 513 // artifactsDir is that path to the dir that will contain the artifacts for 514 // all the tests. 515 artifactsDir string 516 // runnerLogPath is that path to the runner's log file. 517 runnerLogPath string 518 } 519 520 type workerStatus struct { 521 // name is the worker's identifier. 522 name string 523 mu struct { 524 syncutil.Mutex 525 526 // status is presented in the HTML progress page. 527 status string 528 529 ttr testToRunRes 530 t *test 531 c *cluster 532 } 533 } 534 535 func (w *workerStatus) Status() string { 536 w.mu.Lock() 537 defer w.mu.Unlock() 538 return w.mu.status 539 } 540 541 func (w *workerStatus) SetStatus(status string) { 542 w.mu.Lock() 543 w.mu.status = status 544 w.mu.Unlock() 545 } 546 547 func (w *workerStatus) Cluster() *cluster { 548 w.mu.Lock() 549 defer w.mu.Unlock() 550 return w.mu.c 551 } 552 553 func (w *workerStatus) SetCluster(c *cluster) { 554 w.mu.Lock() 555 w.mu.c = c 556 w.mu.Unlock() 557 } 558 559 func (w *workerStatus) TestToRun() testToRunRes { 560 w.mu.Lock() 561 defer w.mu.Unlock() 562 return w.mu.ttr 563 } 564 565 func (w *workerStatus) Test() *test { 566 w.mu.Lock() 567 defer w.mu.Unlock() 568 return w.mu.t 569 } 570 571 func (w *workerStatus) SetTest(t *test, ttr testToRunRes) { 572 w.mu.Lock() 573 w.mu.t = t 574 w.mu.ttr = ttr 575 w.mu.Unlock() 576 } 577 578 // shout logs a message both to a logger and to an io.Writer. 579 // If format doesn't end with a new line, one will be automatically added. 580 func shout(ctx context.Context, l *logger, stdout io.Writer, format string, args ...interface{}) { 581 if len(format) == 0 || format[len(format)-1] != '\n' { 582 format += "\n" 583 } 584 msg := fmt.Sprintf(format, args...) 585 l.PrintfCtxDepth(ctx, 2 /* depth */, msg) 586 fmt.Fprint(stdout, msg) 587 }