github.com/coreos/mantle@v0.13.0/kola/harness.go (about) 1 // Copyright 2015 CoreOS, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package kola 16 17 import ( 18 "encoding/json" 19 "errors" 20 "fmt" 21 "os" 22 "path/filepath" 23 "regexp" 24 "strings" 25 "time" 26 27 "github.com/coreos/go-semver/semver" 28 "github.com/coreos/pkg/capnslog" 29 30 "github.com/coreos/mantle/harness" 31 "github.com/coreos/mantle/harness/reporters" 32 "github.com/coreos/mantle/kola/cluster" 33 "github.com/coreos/mantle/kola/register" 34 "github.com/coreos/mantle/kola/torcx" 35 "github.com/coreos/mantle/platform" 36 awsapi "github.com/coreos/mantle/platform/api/aws" 37 azureapi "github.com/coreos/mantle/platform/api/azure" 38 doapi "github.com/coreos/mantle/platform/api/do" 39 esxapi "github.com/coreos/mantle/platform/api/esx" 40 gcloudapi "github.com/coreos/mantle/platform/api/gcloud" 41 openstackapi "github.com/coreos/mantle/platform/api/openstack" 42 packetapi "github.com/coreos/mantle/platform/api/packet" 43 "github.com/coreos/mantle/platform/conf" 44 "github.com/coreos/mantle/platform/machine/aws" 45 "github.com/coreos/mantle/platform/machine/azure" 46 "github.com/coreos/mantle/platform/machine/do" 47 "github.com/coreos/mantle/platform/machine/esx" 48 "github.com/coreos/mantle/platform/machine/gcloud" 49 "github.com/coreos/mantle/platform/machine/openstack" 50 "github.com/coreos/mantle/platform/machine/packet" 51 "github.com/coreos/mantle/platform/machine/qemu" 52 "github.com/coreos/mantle/platform/machine/unprivqemu" 53 "github.com/coreos/mantle/system" 54 ) 55 56 var ( 57 plog = capnslog.NewPackageLogger("github.com/coreos/mantle", "kola") 58 59 Options = platform.Options{} 60 AWSOptions = awsapi.Options{Options: &Options} // glue to set platform options from main 61 AzureOptions = azureapi.Options{Options: &Options} // glue to set platform options from main 62 DOOptions = doapi.Options{Options: &Options} // glue to set platform options from main 63 ESXOptions = esxapi.Options{Options: &Options} // glue to set platform options from main 64 GCEOptions = gcloudapi.Options{Options: &Options} // glue to set platform options from main 65 OpenStackOptions = openstackapi.Options{Options: &Options} // glue to set platform options from main 66 PacketOptions = packetapi.Options{Options: &Options} // glue to set platform options from main 67 QEMUOptions = qemu.Options{Options: &Options} // glue to set platform options from main 68 69 TestParallelism int //glue var to set test parallelism from main 70 TAPFile string // if not "", write TAP results here 71 TorcxManifestFile string // torcx manifest to expose to tests, if set 72 // TorcxManifest is the unmarshalled torcx manifest file. It is available for 73 // tests to access via `kola.TorcxManifest`. It will be nil if there was no 74 // manifest given to kola. 75 TorcxManifest *torcx.Manifest = nil 76 77 UpdatePayloadFile string 78 79 consoleChecks = []struct { 80 desc string 81 match *regexp.Regexp 82 skipFlag *register.Flag 83 }{ 84 { 85 desc: "emergency shell", 86 match: regexp.MustCompile("Press Enter for emergency shell|Starting Emergency Shell|You are in emergency mode"), 87 skipFlag: &[]register.Flag{register.NoEmergencyShellCheck}[0], 88 }, 89 { 90 desc: "kernel panic", 91 match: regexp.MustCompile("Kernel panic - not syncing: (.*)"), 92 }, 93 { 94 desc: "kernel oops", 95 match: regexp.MustCompile("Oops:"), 96 }, 97 { 98 desc: "kernel warning", 99 match: regexp.MustCompile(`WARNING: CPU: \d+ PID: \d+ at (.+)`), 100 }, 101 { 102 desc: "failure of disk under I/O", 103 match: regexp.MustCompile("rejecting I/O to offline device"), 104 }, 105 { 106 // Failure to set up Packet networking in initramfs, 107 // perhaps due to unresponsive metadata server 108 desc: "coreos-metadata failure to set up initramfs network", 109 match: regexp.MustCompile("Failed to start CoreOS Static Network Agent"), 110 }, 111 { 112 // https://github.com/coreos/bugs/issues/2065 113 desc: "excessive bonding link status messages", 114 match: regexp.MustCompile("(?s:link status up for interface [^,]+, enabling it in [0-9]+ ms.*?){10}"), 115 }, 116 { 117 // https://github.com/coreos/bugs/issues/2180 118 desc: "ext4 delayed allocation failure", 119 match: regexp.MustCompile(`EXT4-fs \([^)]+\): Delayed block allocation failed for inode \d+ at logical offset \d+ with max blocks \d+ with (error \d+)`), 120 }, 121 { 122 // https://github.com/coreos/bugs/issues/2284 123 desc: "GRUB memory corruption", 124 match: regexp.MustCompile("((alloc|free) magic) (is )?broken"), 125 }, 126 { 127 // https://github.com/coreos/bugs/issues/2435 128 desc: "Ignition fetch cancellation race", 129 match: regexp.MustCompile("ignition\\[[0-9]+\\]: failed to fetch config: context canceled"), 130 }, 131 { 132 // https://github.com/coreos/bugs/issues/2526 133 desc: "initrd-cleanup.service terminated", 134 match: regexp.MustCompile("initrd-cleanup\\.service: Main process exited, code=killed, status=15/TERM"), 135 }, 136 { 137 // kernel 4.14.11 138 desc: "bad page table", 139 match: regexp.MustCompile("mm/pgtable-generic.c:\\d+: bad (p.d|pte)"), 140 }, 141 { 142 desc: "Go panic", 143 match: regexp.MustCompile("panic: (.*)"), 144 }, 145 { 146 desc: "segfault", 147 match: regexp.MustCompile("SIGSEGV|=11/SEGV"), 148 }, 149 { 150 desc: "core dump", 151 match: regexp.MustCompile("[Cc]ore dump"), 152 }, 153 } 154 ) 155 156 // NativeRunner is a closure passed to all kola test functions and used 157 // to run native go functions directly on kola machines. It is necessary 158 // glue until kola does introspection. 159 type NativeRunner func(funcName string, m platform.Machine) error 160 161 func NewFlight(pltfrm string) (flight platform.Flight, err error) { 162 switch pltfrm { 163 case "aws": 164 flight, err = aws.NewFlight(&AWSOptions) 165 case "azure": 166 flight, err = azure.NewFlight(&AzureOptions) 167 case "do": 168 flight, err = do.NewFlight(&DOOptions) 169 case "esx": 170 flight, err = esx.NewFlight(&ESXOptions) 171 case "gce": 172 flight, err = gcloud.NewFlight(&GCEOptions) 173 case "openstack": 174 flight, err = openstack.NewFlight(&OpenStackOptions) 175 case "packet": 176 flight, err = packet.NewFlight(&PacketOptions) 177 case "qemu": 178 flight, err = qemu.NewFlight(&QEMUOptions) 179 case "qemu-unpriv": 180 flight, err = unprivqemu.NewFlight(&QEMUOptions) 181 default: 182 err = fmt.Errorf("invalid platform %q", pltfrm) 183 } 184 return 185 } 186 187 func filterTests(tests map[string]*register.Test, pattern, pltfrm string, version semver.Version) (map[string]*register.Test, error) { 188 r := make(map[string]*register.Test) 189 190 checkPlatforms := []string{pltfrm} 191 192 // qemu-unpriv has the same restrictions as QEMU but might also want additional restrictions due to the lack of a Local cluster 193 if pltfrm == "qemu-unpriv" { 194 checkPlatforms = append(checkPlatforms, "qemu") 195 } 196 197 for name, t := range tests { 198 match, err := filepath.Match(pattern, t.Name) 199 if err != nil { 200 return nil, err 201 } 202 if !match { 203 continue 204 } 205 206 // Check the test's min and end versions when running more than one test 207 if t.Name != pattern && versionOutsideRange(version, t.MinVersion, t.EndVersion) { 208 continue 209 } 210 211 existsIn := func(item string, entries []string) bool { 212 for _, i := range entries { 213 if i == item { 214 return true 215 } 216 } 217 return false 218 } 219 220 if existsIn(pltfrm, register.PlatformsNoInternet) && t.HasFlag(register.RequiresInternetAccess) { 221 plog.Debugf("skipping test %s: Internet required but not supported by platform %s", t.Name, pltfrm) 222 continue 223 } 224 225 isAllowed := func(item string, include, exclude []string) (bool, bool) { 226 allowed, excluded := true, false 227 for _, i := range include { 228 if i == item { 229 allowed = true 230 break 231 } else { 232 allowed = false 233 } 234 } 235 for _, i := range exclude { 236 if i == item { 237 allowed = false 238 excluded = true 239 } 240 } 241 return allowed, excluded 242 } 243 244 isExcluded := false 245 allowed := false 246 for _, platform := range checkPlatforms { 247 allowedPlatform, excluded := isAllowed(platform, t.Platforms, t.ExcludePlatforms) 248 if excluded { 249 isExcluded = true 250 break 251 } 252 allowedArchitecture, _ := isAllowed(architecture(platform), t.Architectures, []string{}) 253 allowed = allowed || (allowedPlatform && allowedArchitecture) 254 } 255 if isExcluded || !allowed { 256 continue 257 } 258 259 if allowed, excluded := isAllowed(Options.Distribution, t.Distros, t.ExcludeDistros); !allowed || excluded { 260 continue 261 } 262 263 r[name] = t 264 } 265 266 return r, nil 267 } 268 269 // versionOutsideRange checks to see if version is outside [min, end). If end 270 // is a zero value, it is ignored and there is no upper bound. If version is a 271 // zero value, the bounds are ignored. 272 func versionOutsideRange(version, minVersion, endVersion semver.Version) bool { 273 if version == (semver.Version{}) { 274 return false 275 } 276 277 if version.LessThan(minVersion) { 278 return true 279 } 280 281 if (endVersion != semver.Version{}) && !version.LessThan(endVersion) { 282 return true 283 } 284 285 return false 286 } 287 288 // RunTests is a harness for running multiple tests in parallel. Filters 289 // tests based on a glob pattern and by platform. Has access to all 290 // tests either registered in this package or by imported packages that 291 // register tests in their init() function. 292 // outputDir is where various test logs and data will be written for 293 // analysis after the test run. If it already exists it will be erased! 294 func RunTests(pattern, pltfrm, outputDir string) error { 295 var versionStr string 296 297 // Avoid incurring cost of starting machine in getClusterSemver when 298 // either: 299 // 1) none of the selected tests care about the version 300 // 2) glob is an exact match which means minVersion will be ignored 301 // either way 302 // 3) the provided torcx flag is wrong 303 tests, err := filterTests(register.Tests, pattern, pltfrm, semver.Version{}) 304 if err != nil { 305 plog.Fatal(err) 306 } 307 308 skipGetVersion := true 309 for name, t := range tests { 310 if name != pattern && (t.MinVersion != semver.Version{} || t.EndVersion != semver.Version{}) { 311 skipGetVersion = false 312 break 313 } 314 } 315 316 if TorcxManifestFile != "" { 317 TorcxManifest = &torcx.Manifest{} 318 torcxManifestFile, err := os.Open(TorcxManifestFile) 319 if err != nil { 320 return errors.New("Torcx manifest path provided could not be read") 321 } 322 if err := json.NewDecoder(torcxManifestFile).Decode(TorcxManifest); err != nil { 323 return fmt.Errorf("could not parse torcx manifest as valid json: %v", err) 324 } 325 torcxManifestFile.Close() 326 } 327 328 flight, err := NewFlight(pltfrm) 329 if err != nil { 330 plog.Fatalf("Flight failed: %v", err) 331 } 332 defer flight.Destroy() 333 334 if !skipGetVersion { 335 plog.Info("Creating cluster to check semver...") 336 version, err := getClusterSemver(flight, outputDir) 337 if err != nil { 338 plog.Fatal(err) 339 } 340 341 versionStr = version.String() 342 343 // one more filter pass now that we know real version 344 tests, err = filterTests(tests, pattern, pltfrm, *version) 345 if err != nil { 346 plog.Fatal(err) 347 } 348 } 349 350 opts := harness.Options{ 351 OutputDir: outputDir, 352 Parallel: TestParallelism, 353 Verbose: true, 354 Reporters: reporters.Reporters{ 355 reporters.NewJSONReporter("report.json", pltfrm, versionStr), 356 }, 357 } 358 var htests harness.Tests 359 for _, test := range tests { 360 test := test // for the closure 361 run := func(h *harness.H) { 362 runTest(h, test, pltfrm, flight) 363 } 364 htests.Add(test.Name, run) 365 } 366 367 suite := harness.NewSuite(opts, htests) 368 err = suite.Run() 369 370 if TAPFile != "" { 371 src := filepath.Join(outputDir, "test.tap") 372 if err2 := system.CopyRegularFile(src, TAPFile); err == nil && err2 != nil { 373 err = err2 374 } 375 } 376 377 if err != nil { 378 fmt.Printf("FAIL, output in %v\n", outputDir) 379 } else { 380 fmt.Printf("PASS, output in %v\n", outputDir) 381 } 382 383 return err 384 } 385 386 // getClusterSemVer returns the CoreOS semantic version via starting a 387 // machine and checking 388 func getClusterSemver(flight platform.Flight, outputDir string) (*semver.Version, error) { 389 var err error 390 391 testDir := filepath.Join(outputDir, "get_cluster_semver") 392 if err := os.MkdirAll(testDir, 0777); err != nil { 393 return nil, err 394 } 395 396 cluster, err := flight.NewCluster(&platform.RuntimeConfig{ 397 OutputDir: testDir, 398 }) 399 if err != nil { 400 return nil, fmt.Errorf("creating cluster for semver check: %v", err) 401 } 402 defer cluster.Destroy() 403 404 m, err := cluster.NewMachine(nil) 405 if err != nil { 406 return nil, fmt.Errorf("creating new machine for semver check: %v", err) 407 } 408 409 out, stderr, err := m.SSH("grep ^VERSION_ID= /etc/os-release") 410 if err != nil { 411 return nil, fmt.Errorf("parsing /etc/os-release: %v: %s", err, stderr) 412 } 413 ver := strings.Split(string(out), "=")[1] 414 415 // TODO: add distro specific version handling 416 switch Options.Distribution { 417 case "cl": 418 return parseCLVersion(ver) 419 case "rhcos": 420 return &semver.Version{}, nil 421 } 422 423 return nil, fmt.Errorf("no case to handle version parsing for distribution %q", Options.Distribution) 424 } 425 426 func parseCLVersion(input string) (*semver.Version, error) { 427 version, err := semver.NewVersion(input) 428 if err != nil { 429 return nil, fmt.Errorf("parsing os-release semver: %v", err) 430 } 431 432 return version, nil 433 } 434 435 // runTest is a harness for running a single test. 436 // outputDir is where various test logs and data will be written for 437 // analysis after the test run. It should already exist. 438 func runTest(h *harness.H, t *register.Test, pltfrm string, flight platform.Flight) { 439 h.Parallel() 440 441 rconf := &platform.RuntimeConfig{ 442 OutputDir: h.OutputDir(), 443 NoSSHKeyInUserData: t.HasFlag(register.NoSSHKeyInUserData), 444 NoSSHKeyInMetadata: t.HasFlag(register.NoSSHKeyInMetadata), 445 NoEnableSelinux: t.HasFlag(register.NoEnableSelinux), 446 } 447 c, err := flight.NewCluster(rconf) 448 if err != nil { 449 h.Fatalf("Cluster failed: %v", err) 450 } 451 defer func() { 452 c.Destroy() 453 for id, output := range c.ConsoleOutput() { 454 for _, badness := range CheckConsole([]byte(output), t) { 455 h.Errorf("Found %s on machine %s console", badness, id) 456 } 457 } 458 for id, output := range c.JournalOutput() { 459 for _, badness := range CheckConsole([]byte(output), t) { 460 h.Errorf("Found %s on machine %s journal", badness, id) 461 } 462 } 463 }() 464 465 if t.ClusterSize > 0 { 466 var userdata *conf.UserData 467 if Options.IgnitionVersion == "v2" { 468 userdata = t.UserData 469 } else if Options.IgnitionVersion == "v3" { 470 userdata = t.UserDataV3 471 } 472 if userdata != nil && userdata.Contains("$discovery") { 473 url, err := c.GetDiscoveryURL(t.ClusterSize) 474 if err != nil { 475 // Skip instead of failing since the harness not being able to 476 // get a discovery url is likely an outage (e.g 477 // 503 Service Unavailable: Back-end server is at capacity) 478 // not a problem with the OS 479 h.Skipf("Failed to create discovery endpoint: %v", err) 480 } 481 userdata = userdata.Subst("$discovery", url) 482 } 483 484 if _, err := platform.NewMachines(c, userdata, t.ClusterSize); err != nil { 485 h.Fatalf("Cluster failed starting machines: %v", err) 486 } 487 } 488 489 // pass along all registered native functions 490 var names []string 491 for k := range t.NativeFuncs { 492 names = append(names, k) 493 } 494 495 // Cluster -> TestCluster 496 tcluster := cluster.TestCluster{ 497 H: h, 498 Cluster: c, 499 NativeFuncs: names, 500 FailFast: t.FailFast, 501 } 502 503 // drop kolet binary on machines 504 if t.NativeFuncs != nil { 505 scpKolet(tcluster, architecture(pltfrm)) 506 } 507 508 defer func() { 509 // give some time for the remote journal to be flushed so it can be read 510 // before we run the deferred machine destruction 511 time.Sleep(2 * time.Second) 512 }() 513 514 // run test 515 t.Run(tcluster) 516 } 517 518 // architecture returns the machine architecture of the given platform. 519 func architecture(pltfrm string) string { 520 nativeArch := "amd64" 521 if pltfrm == "qemu" && QEMUOptions.Board != "" { 522 nativeArch = boardToArch(QEMUOptions.Board) 523 } 524 if pltfrm == "packet" && PacketOptions.Board != "" { 525 nativeArch = boardToArch(PacketOptions.Board) 526 } 527 return nativeArch 528 } 529 530 // returns the arch part of an sdk board name 531 func boardToArch(board string) string { 532 return strings.SplitN(board, "-", 2)[0] 533 } 534 535 // scpKolet searches for a kolet binary and copies it to the machine. 536 func scpKolet(c cluster.TestCluster, mArch string) { 537 for _, d := range []string{ 538 ".", 539 filepath.Dir(os.Args[0]), 540 filepath.Join(filepath.Dir(os.Args[0]), mArch), 541 filepath.Join("/usr/lib/kola", mArch), 542 } { 543 kolet := filepath.Join(d, "kolet") 544 if _, err := os.Stat(kolet); err == nil { 545 if err := c.DropFile(kolet); err != nil { 546 c.Fatalf("dropping kolet binary: %v", err) 547 } 548 // The default SELinux rules do not allow init_t to execute user_home_t 549 if Options.Distribution == "rhcos" || Options.Distribution == "fcos" { 550 for _, machine := range c.Machines() { 551 out, stderr, err := machine.SSH("sudo chcon -t bin_t kolet") 552 if err != nil { 553 c.Fatalf("running chcon on kolet: %s: %s: %v", out, stderr, err) 554 } 555 } 556 } 557 return 558 } 559 } 560 c.Fatalf("Unable to locate kolet binary for %s", mArch) 561 } 562 563 // CheckConsole checks some console output for badness and returns short 564 // descriptions of any badness it finds. If t is specified, its flags are 565 // respected. 566 func CheckConsole(output []byte, t *register.Test) []string { 567 var ret []string 568 for _, check := range consoleChecks { 569 if check.skipFlag != nil && t != nil && t.HasFlag(*check.skipFlag) { 570 continue 571 } 572 match := check.match.FindSubmatch(output) 573 if match != nil { 574 badness := check.desc 575 if len(match) > 1 { 576 // include first subexpression 577 badness += fmt.Sprintf(" (%s)", match[1]) 578 } 579 ret = append(ret, badness) 580 } 581 } 582 return ret 583 } 584 585 func SetupOutputDir(outputDir, platform string) (string, error) { 586 defaulted := outputDir == "" 587 defaultBaseDirName := "_kola_temp" 588 defaultDirName := fmt.Sprintf("%s-%s-%d", platform, time.Now().Format("2006-01-02-1504"), os.Getpid()) 589 590 if defaulted { 591 if _, err := os.Stat(defaultBaseDirName); os.IsNotExist(err) { 592 if err := os.Mkdir(defaultBaseDirName, 0777); err != nil { 593 return "", err 594 } 595 } 596 outputDir = filepath.Join(defaultBaseDirName, defaultDirName) 597 } 598 599 outputDir, err := harness.CleanOutputDir(outputDir) 600 if err != nil { 601 return "", err 602 } 603 604 if defaulted { 605 tempLinkPath := filepath.Join(outputDir, "latest") 606 linkPath := filepath.Join(defaultBaseDirName, platform+"-latest") 607 // don't clobber existing files that are not symlinks 608 st, err := os.Lstat(linkPath) 609 if err == nil && (st.Mode()&os.ModeType) != os.ModeSymlink { 610 return "", fmt.Errorf("%v exists and is not a symlink", linkPath) 611 } else if err != nil && !os.IsNotExist(err) { 612 return "", err 613 } 614 if err := os.Symlink(defaultDirName, tempLinkPath); err != nil { 615 return "", err 616 } 617 // atomic rename 618 if err := os.Rename(tempLinkPath, linkPath); err != nil { 619 os.Remove(tempLinkPath) 620 return "", err 621 } 622 } 623 624 return outputDir, nil 625 }