gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/test/gpu/cuda_test.go (about) 1 // Copyright 2024 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package cuda_test tests basic CUDA workloads. 16 package cuda_test 17 18 import ( 19 "context" 20 "errors" 21 "flag" 22 "fmt" 23 "math" 24 "os" 25 "runtime" 26 "strconv" 27 "strings" 28 "sync" 29 "testing" 30 "time" 31 32 "golang.org/x/sync/errgroup" 33 "gvisor.dev/gvisor/pkg/test/dockerutil" 34 "gvisor.dev/gvisor/pkg/test/testutil" 35 ) 36 37 const ( 38 // defaultTestTimeout is the default timeout for a single CUDA sample test. 39 defaultTestTimeout = 20 * time.Minute 40 41 // hangingTestTimeout is the test timeout for tests that are fast when they 42 // succeed, but hang forever otherwise. 43 hangingTestTimeout = 1 * time.Minute 44 45 // defaultContainersPerCPU is the default number of pooled containers to 46 // spawn for each CPU. This can be a floating-point value. 47 // This value was arrived at experimentally and has no particular meaning. 48 // Setting it too low will cause the test to take longer than necessary 49 // because of insufficient parallelism. 50 // However, setting it too high will *also* cause the test to take longer 51 // than necessary, because the added resource contention will cause more 52 // tests to fail when run in parallel with each other, forcing them to be 53 // re-run serialized. 54 defaultContainersPerCPU = 1.75 55 56 // exitCodeWaived is the EXIT_WAIVED constant used in CUDA tests. 57 // This exit code is typically used by CUDA tests to indicate that the 58 // test requires a capability or condition that is not met in the current 59 // test environment. 60 exitCodeWaived = 2 61 ) 62 63 // Flags. 64 var ( 65 verifyCompatibility = flag.Bool("cuda_verify_compatibility", os.Getenv("GVISOR_TEST_CUDA_VERIFY_COMPATIBILITY") == "true", "whether to verify that all tests are marked as compatible") 66 logSuccessfulTests = flag.Bool("cuda_log_successful_tests", false, "log console output of successful tests") 67 debug = flag.Bool("cuda_test_debug", false, "log more data as the test is running") 68 containersPerCPU = flag.Float64("cuda_containers_per_cpu", defaultContainersPerCPU, "number of parallel execution containers to spawn per CPU (floating point values allowed)") 69 ) 70 71 // testCompatibility maps test names to their compatibility data. 72 // Unmapped test names are assumed to be fully compatible. 73 var testCompatibility = map[string]Compatibility{ 74 "0_Introduction/simpleAttributes": RequiresFeatures(FeaturePersistentL2Caching), 75 "0_Introduction/simpleCUDA2GL": RequiresFeatures(FeatureGL), 76 "0_Introduction/simpleIPC": &BrokenInGVisor{OnlyWhenMultipleGPU: true}, 77 "0_Introduction/simpleP2P": MultiCompatibility(&RequiresMultiGPU{}, &BrokenInGVisor{}), 78 "0_Introduction/UnifiedMemoryStreams": &BrokenInGVisor{}, 79 "0_Introduction/vectorAddMMAP": &BrokenInGVisor{OnlyWhenMultipleGPU: true}, 80 "2_Concepts_and_Techniques/cuHook": &BrokenEverywhere{ 81 Reason: "Requires ancient version of glibc (<=2.33)", 82 }, 83 "2_Concepts_and_Techniques/EGLStream_CUDA_Interop": &BrokenEverywhere{ 84 Reason: "Requires newer version of EGL libraries than Ubuntu has (eglCreateStreamKHR)", 85 }, 86 "2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU": MultiCompatibility( 87 &RequiresMultiGPU{}, 88 &BrokenEverywhere{ 89 Reason: "Requires newer version of EGL libraries than Ubuntu has (eglCreateStreamKHR)", 90 }, 91 ), 92 "2_Concepts_and_Techniques/EGLSync_CUDAEvent_Interop": &OnlyOnWindows{}, 93 "2_Concepts_and_Techniques/streamOrderedAllocationIPC": &BrokenInGVisor{}, 94 "2_Concepts_and_Techniques/streamOrderedAllocationP2P": MultiCompatibility(&RequiresMultiGPU{}, &BrokenInGVisor{}), 95 "3_CUDA_Features/bf16TensorCoreGemm": RequiresFeatures(FeatureTensorCores), 96 "3_CUDA_Features/cdpAdvancedQuicksort": RequiresFeatures(FeatureDynamicParallelism), 97 "3_CUDA_Features/cudaCompressibleMemory": RequiresFeatures(FeatureCompressibleMemory), 98 "3_CUDA_Features/dmmaTensorCoreGemm": RequiresFeatures(FeatureTensorCores), 99 "3_CUDA_Features/memMapIPCDrv": MultiCompatibility(&RequiresMultiGPU{}, &BrokenInGVisor{}), 100 "3_CUDA_Features/tf32TensorCoreGemm": RequiresFeatures(FeatureTensorCores), 101 "4_CUDA_Libraries/conjugateGradientMultiDeviceCG": MultiCompatibility(&RequiresMultiGPU{}, &BrokenInGVisor{}), 102 "4_CUDA_Libraries/cudaNvSci": &RequiresNvSci{}, 103 "4_CUDA_Libraries/cudaNvSciNvMedia": &RequiresNvSci{}, 104 "4_CUDA_Libraries/cuDLAErrorReporting": &OnlyOnWindows{}, 105 "4_CUDA_Libraries/cuDLAHybridMode": &OnlyOnWindows{}, 106 "4_CUDA_Libraries/cuDLAStandaloneMode": &OnlyOnWindows{}, 107 "4_CUDA_Libraries/cuDLALayerwiseStatsHybrid": &OnlyOnWindows{}, 108 "4_CUDA_Libraries/cuDLALayerwiseStatsStandalone": &OnlyOnWindows{}, 109 "4_CUDA_Libraries/simpleCUFFT_2d_MGPU": MultiCompatibility(&RequiresMultiGPU{}, &BrokenInGVisor{}), 110 "4_CUDA_Libraries/simpleCUFFT_MGPU": MultiCompatibility(&RequiresMultiGPU{}, &BrokenInGVisor{}), 111 "5_Domain_Specific/fluidsD3D9": &OnlyOnWindows{}, 112 "5_Domain_Specific/fluidsGL": RequiresFeatures(FeatureGL), 113 "5_Domain_Specific/fluidsGLES": &OnlyOnWindows{}, 114 "5_Domain_Specific/nbody_opengles": &OnlyOnWindows{}, 115 "5_Domain_Specific/nbody_screen": &OnlyOnWindows{}, 116 "5_Domain_Specific/p2pBandwidthLatencyTest": &BrokenInGVisor{OnlyWhenMultipleGPU: true}, 117 "5_Domain_Specific/postProcessGL": RequiresFeatures(FeatureGL), 118 "5_Domain_Specific/simpleD3D10": &OnlyOnWindows{}, 119 "5_Domain_Specific/simpleD3D10RenderTarget": &OnlyOnWindows{}, 120 "5_Domain_Specific/simpleD3D10Texture": &OnlyOnWindows{}, 121 "5_Domain_Specific/simpleD3D11": &OnlyOnWindows{}, 122 "5_Domain_Specific/simpleD3D11Texture": &OnlyOnWindows{}, 123 "5_Domain_Specific/simpleD3D12": &OnlyOnWindows{}, 124 "5_Domain_Specific/simpleD3D9": &OnlyOnWindows{}, 125 "5_Domain_Specific/simpleD3D9Texture": &OnlyOnWindows{}, 126 "5_Domain_Specific/simpleGLES": &OnlyOnWindows{}, 127 "5_Domain_Specific/simpleGLES_EGLOutput": &OnlyOnWindows{}, 128 "5_Domain_Specific/simpleGLES_screen": &OnlyOnWindows{}, 129 "5_Domain_Specific/simpleVulkan": RequiresFeatures(FeatureGL), 130 "5_Domain_Specific/simpleVulkanMMAP": RequiresFeatures(FeatureGL), 131 "5_Domain_Specific/SLID3D10Texture": &OnlyOnWindows{}, 132 "5_Domain_Specific/VFlockingD3D10": &OnlyOnWindows{}, 133 "5_Domain_Specific/vulkanImageCUDA": RequiresFeatures(FeatureGL), 134 } 135 136 // flakyTests is a list of tests that are flaky. 137 // These will be retried up to 3 times in parallel before running serially. 138 var flakyTests = map[string]struct{}{} 139 140 // exclusiveTests is a list of tests that must run exclusively (i.e. with 141 // no other test running on the machine at the same time), or they will 142 // likely fail. These tests are not attempted to be run in parallel. 143 // This is usually the case for performance tests or tests that use a lot 144 // of resources in general. 145 // This saves the trouble to run them in parallel, while also avoiding 146 // causing spurious failures for the tests that happen to be running in 147 // parallel with them. 148 var exclusiveTests = map[string]struct{}{ 149 "6_Performance/alignedTypes": {}, 150 "6_Performance/transpose": {}, 151 "6_Performance/UnifiedMemoryPerf": {}, 152 } 153 154 // alwaysSkippedTests don't run at all, ever, and are not verified when 155 // --cuda_verify_compatibility is set. 156 // Each test is mapped to a reason why it should be skipped. 157 var alwaysSkippedTests = map[string]string{ 158 // These tests seem to flake in gVisor, but consistently within the same 159 // run of the overall test, so they cannot be included in `flakyTests`. 160 "0_Introduction/simpleAssert": "Flaky in gVisor", 161 "0_Introduction/simpleAssert_nvrtc": "Flaky in gVisor", 162 } 163 164 // Feature is a feature as listed by /list_features.sh. 165 type Feature string 166 167 // All CUDA features listed by /list_features.sh. 168 const ( 169 FeaturePersistentL2Caching Feature = "PERSISTENT_L2_CACHING" 170 FeatureDynamicParallelism Feature = "DYNAMIC_PARALLELISM" 171 FeatureGL Feature = "GL" 172 FeatureTensorCores Feature = "TENSOR_CORES" 173 FeatureCompressibleMemory Feature = "COMPRESSIBLE_MEMORY" 174 ) 175 176 // allFeatures is a list of all CUDA features above. 177 var allFeatures = []Feature{ 178 FeaturePersistentL2Caching, 179 FeatureDynamicParallelism, 180 FeatureGL, 181 FeatureTensorCores, 182 FeatureCompressibleMemory, 183 } 184 185 // TestEnvironment represents the environment in which a sample test runs. 186 type TestEnvironment struct { 187 NumGPUs int 188 RuntimeIsGVisor bool 189 Features map[Feature]bool 190 } 191 192 // Compatibility encodes the compatibility of a test depending on the 193 // environment it runs in. 194 type Compatibility interface { 195 // WillFail returns a string explaining why the test is expected to fail 196 // in the given environment, or "" if it isn't expected to fail. 197 WillFail(ctx context.Context, env *TestEnvironment) string 198 199 // IsExpectedFailure checks whether the `logs` (from a failed run of the test 200 // in the given environment) matches the failure that this test expects in 201 // that environment. If they match, this function should return nil. 202 // It is only called when `WillFail` returns a non-empty string for the same 203 // environment, so it may assume that `env` is non-compatible. 204 IsExpectedFailure(ctx context.Context, env *TestEnvironment, logs string, exitCode int) error 205 } 206 207 // BrokenEverywhere implements `Compatibility` for tests that are broken in 208 // all environments. 209 type BrokenEverywhere struct { 210 Reason string 211 } 212 213 // WillFail implements `Compatibility.WillFail`. 214 func (be *BrokenEverywhere) WillFail(ctx context.Context, env *TestEnvironment) string { 215 return fmt.Sprintf("Known-broken test: %v", be.Reason) 216 } 217 218 // IsExpectedFailure implements `Compatibility.IsExpectedFailure`. 219 func (*BrokenEverywhere) IsExpectedFailure(ctx context.Context, env *TestEnvironment, logs string, exitCode int) error { 220 return nil 221 } 222 223 // BrokenInGVisor implements `Compatibility` for tests that are broken in 224 // gVisor only. 225 type BrokenInGVisor struct { 226 // OnlyWhenMultipleGPU may be set to true for tests which only fail when 227 // multiple GPUs are present. This should not be used for tests that 228 // *require* multiple GPUs to run (use RequiresMultiGPU instead). 229 // This is for tests that can run on a single or multiple GPUs alike, 230 // but specifically fail in gVisor when run with multiple GPUs. 231 OnlyWhenMultipleGPU bool 232 233 // KnownToHang may be set to true for short tests which can hang instead 234 // of failing. This avoids waiting ~forever for them to finish. 235 KnownToHang bool 236 } 237 238 // WillFail implements `Compatibility.WillFail`. 239 func (big *BrokenInGVisor) WillFail(ctx context.Context, env *TestEnvironment) string { 240 if !env.RuntimeIsGVisor { 241 return "" 242 } 243 if big.OnlyWhenMultipleGPU && env.NumGPUs == 1 { 244 return "" 245 } 246 if big.OnlyWhenMultipleGPU { 247 return "Known to be broken in gVisor when multiple GPUs are present" 248 } 249 return "Known to be broken in gVisor" 250 } 251 252 // IsExpectedFailure implements `Compatibility.IsExpectedFailure`. 253 func (*BrokenInGVisor) IsExpectedFailure(ctx context.Context, env *TestEnvironment, logs string, exitCode int) error { 254 return nil 255 } 256 257 // RequiresMultiGPU implements `Compatibility` for tests that require multiple 258 // GPUs. 259 type RequiresMultiGPU struct{} 260 261 // WillFail implements `Compatibility.WillFail`. 262 func (*RequiresMultiGPU) WillFail(ctx context.Context, env *TestEnvironment) string { 263 if env.NumGPUs < 2 { 264 return "Requires >= 2 GPUs" 265 } 266 return "" 267 } 268 269 // IsExpectedFailure implements `Compatibility.IsExpectedFailure`. 270 func (*RequiresMultiGPU) IsExpectedFailure(ctx context.Context, env *TestEnvironment, logs string, exitCode int) error { 271 if exitCode != exitCodeWaived { 272 return fmt.Errorf("exit code %d, expected EXIT_WAIVED (%d)", exitCode, exitCodeWaived) 273 } 274 return nil 275 } 276 277 // requiresFeatures implements `Compatibility` for tests that require 278 // specific features. 279 type requiresFeatures struct { 280 features []Feature 281 } 282 283 func RequiresFeatures(features ...Feature) Compatibility { 284 return &requiresFeatures{features: features} 285 } 286 287 // WillFail implements `Compatibility.WillFail`. 288 func (r *requiresFeatures) WillFail(ctx context.Context, env *TestEnvironment) string { 289 for _, feature := range r.features { 290 if !env.Features[feature] { 291 return fmt.Sprintf("Requires feature %s", feature) 292 } 293 } 294 return "" 295 } 296 297 // IsExpectedFailure implements `Compatibility.IsExpectedFailure`. 298 func (*requiresFeatures) IsExpectedFailure(ctx context.Context, env *TestEnvironment, logs string, exitCode int) error { 299 if exitCode != exitCodeWaived { 300 return fmt.Errorf("exit code %d, expected EXIT_WAIVED (%d)", exitCode, exitCodeWaived) 301 } 302 return nil 303 } 304 305 // OnlyOnWindows implements `Compatibility` for tests that are only expected 306 // to only pass on Windows. 307 type OnlyOnWindows struct{} 308 309 // WillFail implements `Compatibility.WillFail`. 310 func (*OnlyOnWindows) WillFail(ctx context.Context, env *TestEnvironment) string { 311 if runtime.GOOS != "windows" { 312 return "Only runs on Windows" 313 } 314 return "" 315 } 316 317 // IsExpectedFailure implements `Compatibility.IsExpectedFailure`. 318 func (*OnlyOnWindows) IsExpectedFailure(ctx context.Context, env *TestEnvironment, logs string, exitCode int) error { 319 if strings.Contains(logs, "is not supported on Linux") { 320 return nil 321 } 322 if exitCode != exitCodeWaived { 323 return fmt.Errorf("exit code %d, expected EXIT_WAIVED (%d)", exitCode, exitCodeWaived) 324 } 325 return nil 326 } 327 328 type RequiresNvSci struct{} 329 330 // WillFail implements `Compatibility.WillFail`. 331 func (*RequiresNvSci) WillFail(ctx context.Context, env *TestEnvironment) string { 332 return "Requires NvSci library which is not open-source" 333 } 334 335 // IsExpectedFailure implements `Compatibility.IsExpectedFailure`. 336 func (*RequiresNvSci) IsExpectedFailure(ctx context.Context, env *TestEnvironment, logs string, exitCode int) error { 337 return nil 338 } 339 340 // multiCompatibility implements `Compatibility` with multiple possible 341 // Compatibility implementations. 342 type multiCompatibility struct { 343 compats []Compatibility 344 } 345 346 // MultiCompatibility implements `Compatibility` with multiple possible 347 // Compatibility implementations. 348 func MultiCompatibility(compats ...Compatibility) Compatibility { 349 return &multiCompatibility{compats: compats} 350 } 351 352 // WillFail implements `Compatibility.WillFail`. 353 func (mc *multiCompatibility) WillFail(ctx context.Context, env *TestEnvironment) string { 354 for _, compat := range mc.compats { 355 if reason := compat.WillFail(ctx, env); reason != "" { 356 return reason 357 } 358 } 359 return "" 360 } 361 362 // IsExpectedFailure implements `Compatibility.IsExpectedFailure`. 363 func (mc *multiCompatibility) IsExpectedFailure(ctx context.Context, env *TestEnvironment, logs string, exitCode int) error { 364 var possibleCompats []Compatibility 365 for _, compat := range mc.compats { 366 if reason := compat.WillFail(ctx, env); reason != "" { 367 possibleCompats = append(possibleCompats, compat) 368 } 369 } 370 if len(possibleCompats) == 0 { 371 return errors.New("no known explanation for this failure") 372 } 373 var errs []string 374 for _, compat := range possibleCompats { 375 err := compat.IsExpectedFailure(ctx, env, logs, exitCode) 376 if err == nil { 377 return nil 378 } 379 errs = append(errs, fmt.Sprintf("might have been broken because %s but %v", compat.WillFail(ctx, env), err)) 380 } 381 return fmt.Errorf("no known explanation for this failure: %v", strings.Join(errs, "; ")) 382 } 383 384 // FullyCompatible implements `Compatibility` for tests that are expected to 385 // pass in any environment. 386 type FullyCompatible struct{} 387 388 // WillFail implements `Compatibility.WillFail`. 389 func (*FullyCompatible) WillFail(ctx context.Context, env *TestEnvironment) string { 390 return "" 391 } 392 393 // IsExpectedFailure implements `Compatibility.IsExpectedFailure`. 394 func (*FullyCompatible) IsExpectedFailure(ctx context.Context, env *TestEnvironment, logs string, exitCode int) error { 395 return errors.New("test is expected to pass regardless of environment") 396 } 397 398 // getContainerOpts returns the container run options to run CUDA tests. 399 func getContainerOpts() dockerutil.RunOpts { 400 opts := dockerutil.GPURunOpts() 401 opts.Image = "gpu/cuda-tests" 402 return opts 403 } 404 405 // testLog logs a line as a test log. 406 // If debug is enabled, it is also printed immediately to stderr. 407 // This is useful for debugging tests. 408 func testLog(t *testing.T, format string, values ...any) { 409 t.Helper() 410 if *debug { 411 fmt.Fprintf(os.Stderr, "[%s] %s\n", t.Name(), fmt.Sprintf(format, values...)) 412 } 413 t.Logf(format, values...) 414 } 415 416 // multiLineLog logs a multiline string as separate log messages to `t`. 417 // This is useful to log multi-line container logs without them looking weird 418 // with line breaks in the middle. 419 func multiLineLog(t *testing.T, output string) { 420 t.Helper() 421 for _, line := range strings.Split(output, "\n") { 422 // `line` may contain % characters here, so we need to format it through 423 // `%s` so that `%` characters don't show up as "MISSING" in the logs. 424 testLog(t, "%s", line) 425 } 426 } 427 428 // GetEnvironment returns the environment in which a sample test runs. 429 func GetEnvironment(ctx context.Context, t *testing.T) (*TestEnvironment, error) { 430 numGPU := dockerutil.NumGPU() 431 if numGPU == 0 { 432 return nil, errors.New("no GPUs detected") 433 } 434 if numGPU == 1 { 435 testLog(t, "1 GPU detected") 436 } else { 437 testLog(t, "%d GPUs detected", numGPU) 438 } 439 runtimeIsGVisor, err := dockerutil.IsGVisorRuntime(ctx, t) 440 if err != nil { 441 return nil, fmt.Errorf("cannot determine if runtime is gVisor or not: %w", err) 442 } 443 if runtimeIsGVisor { 444 testLog(t, "Runtime is detected as gVisor") 445 } else { 446 testLog(t, "Runtime is detected as not gVisor") 447 } 448 featuresContainer := dockerutil.MakeContainer(ctx, t) 449 defer featuresContainer.CleanUp(ctx) 450 featuresList, err := featuresContainer.Run(ctx, getContainerOpts(), "/list_features.sh") 451 if err != nil { 452 return nil, fmt.Errorf("cannot get list of CUDA features: %v", err) 453 } 454 features := make(map[Feature]bool) 455 for _, line := range strings.Split(featuresList, "\n") { 456 line = strings.TrimSpace(line) 457 if line == "" { 458 continue 459 } 460 featureAvailable := false 461 var feature Feature 462 if strings.HasPrefix(line, "PRESENT: ") { 463 featureAvailable = true 464 feature = Feature(strings.TrimPrefix(line, "PRESENT: ")) 465 } else if strings.HasPrefix(line, "ABSENT: ") { 466 featureAvailable = false 467 feature = Feature(strings.TrimPrefix(line, "ABSENT: ")) 468 } else { 469 return nil, fmt.Errorf("unexpected CUDA feature line: %q", line) 470 } 471 found := false 472 for _, f := range allFeatures { 473 if feature == f { 474 features[f] = featureAvailable 475 if featureAvailable { 476 testLog(t, "CUDA feature is available: %s", string(f)) 477 } else { 478 testLog(t, "CUDA feature is *not* available: %s", string(f)) 479 } 480 found = true 481 break 482 } 483 } 484 if !found { 485 return nil, fmt.Errorf("unknown CUDA feature: %s", string(feature)) 486 } 487 } 488 for _, feature := range allFeatures { 489 if _, ok := features[feature]; !ok { 490 return nil, fmt.Errorf("CUDA feature not found in feature list: %s", string(feature)) 491 } 492 } 493 // Use CUDA dynamic parallelism as a litmus test to see if the features were 494 // enumerated correctly. 495 if _, hasDynamicParallelism := features[FeatureDynamicParallelism]; !hasDynamicParallelism { 496 return nil, errors.New("CUDA feature Dynamic Parallelism is not available yet should be available in all environments gVisor supports; this indicates a failure in the feature listing script") 497 } 498 return &TestEnvironment{ 499 NumGPUs: numGPU, 500 RuntimeIsGVisor: runtimeIsGVisor, 501 Features: features, 502 }, nil 503 } 504 505 // runSampleTest runs a single CUDA sample test. 506 // It first tries to run in pooled container. 507 // If that fails, then it runs in an exclusive container. 508 // It returns a skip reason (or empty if the test was not skipped), and 509 // an error if the test fails. 510 func runSampleTest(ctx context.Context, t *testing.T, testName string, te *TestEnvironment, cp *dockerutil.ContainerPool) (string, error) { 511 compat, found := testCompatibility[testName] 512 if !found { 513 compat = &FullyCompatible{} 514 } 515 willFailReason := compat.WillFail(ctx, te) 516 if willFailReason != "" && !*verifyCompatibility { 517 return fmt.Sprintf("this test is expected to fail (%s) --cuda_verify_compatibility=true to verify compatibility)", willFailReason), nil 518 } 519 if skipReason, isAlwaysSkipped := alwaysSkippedTests[testName]; isAlwaysSkipped { 520 return fmt.Sprintf("this test is always skipped (%v)", skipReason), nil 521 } 522 testTimeout := defaultTestTimeout 523 execTestTimeout := testTimeout - 15*time.Second 524 testAttempts := 1 525 if _, isFlakyTest := flakyTests[testName]; isFlakyTest { 526 testAttempts = 3 527 } 528 parallelAttempts := testAttempts 529 if _, isExclusiveTest := exclusiveTests[testName]; isExclusiveTest { 530 parallelAttempts = 0 531 } 532 for attempt := 0; attempt < parallelAttempts; attempt++ { 533 c, release, err := cp.Get(ctx) 534 if err != nil { 535 release() 536 return "", fmt.Errorf("failed to get container: %v", err) 537 } 538 cp.SetContainerLabel(c, fmt.Sprintf("Running %s in parallel (attempt %d/%d)", testName, attempt+1, parallelAttempts)) 539 testLog(t, "Running test in parallel mode in container %s (attempt %d/%d)...", c.Name, attempt+1, parallelAttempts) 540 parallelCtx, parallelCancel := context.WithTimeoutCause(ctx, testTimeout, errors.New("parallel execution took too long")) 541 testStartedAt := time.Now() 542 output, err := c.Exec(parallelCtx, dockerutil.ExecOpts{}, "/run_sample", fmt.Sprintf("--timeout=%v", execTestTimeout), testName) 543 testDuration := time.Since(testStartedAt) 544 parallelCancel() 545 release() 546 if err == nil { 547 if willFailReason != "" { 548 multiLineLog(t, output) 549 return "", fmt.Errorf("test unexpectedly succeeded, but we expected it to fail: %s; please update `testCompatibility`", willFailReason) 550 } 551 // Only log the output when the test succeeds here. 552 // If it fails, we'll run exclusively below, and the output from *that* 553 // run will be logged instead. 554 if *logSuccessfulTests { 555 multiLineLog(t, output) 556 } 557 testLog(t, "Test passed in parallel mode in %v.", testDuration) 558 return "", nil 559 } 560 var exitCode int 561 if execErr, ok := err.(*dockerutil.ExecError); ok { 562 exitCode = execErr.ExitStatus 563 } 564 if willFailReason != "" { 565 isExpectedErr := compat.IsExpectedFailure(ctx, te, output, exitCode) 566 if isExpectedErr == nil { 567 testLog(t, "Test failed as expected: %s (took %v)", willFailReason, testDuration) 568 return "", nil 569 } 570 } 571 } 572 if parallelAttempts > 0 { 573 testLog(t, "Will re-run the test in exclusive mode.") 574 } 575 c, release, err := cp.GetExclusive(ctx) 576 defer release() 577 if err != nil { 578 return "", fmt.Errorf("failed to get excusive container: %v", err) 579 } 580 var testErr error 581 for attempt := 0; attempt < testAttempts; attempt++ { 582 cp.SetContainerLabel(c, fmt.Sprintf("Running %s exclusively (attempt %d/%d)", testName, attempt+1, testAttempts)) 583 testLog(t, "Running test in exclusive mode in container %s (attempt %d/%d)...", c.Name, attempt+1, testAttempts) 584 exclusiveCtx, exclusiveCancel := context.WithTimeoutCause(ctx, testTimeout, errors.New("exclusive execution took too long")) 585 testStartedAt := time.Now() 586 var output string 587 output, testErr = c.Exec(exclusiveCtx, dockerutil.ExecOpts{}, "/run_sample", fmt.Sprintf("--timeout=%v", execTestTimeout), testName) 588 testDuration := time.Since(testStartedAt) 589 exclusiveCancel() 590 if testErr == nil { 591 if willFailReason != "" { 592 multiLineLog(t, output) 593 return "", fmt.Errorf("test unexpectedly succeeded, but we expected it to fail: %s; please update `testCompatibility`", willFailReason) 594 } 595 if *logSuccessfulTests { 596 multiLineLog(t, output) 597 } 598 testLog(t, "Test passed in exclusive mode in %v.", testDuration) 599 return "", nil 600 } 601 multiLineLog(t, output) 602 var exitCode int 603 if execErr, ok := testErr.(*dockerutil.ExecError); ok { 604 exitCode = execErr.ExitStatus 605 } 606 if willFailReason != "" { 607 isExpectedErr := compat.IsExpectedFailure(ctx, te, output, exitCode) 608 if isExpectedErr == nil { 609 testLog(t, "Test failed as expected: %s (took %v)", willFailReason, testDuration) 610 return "", nil 611 } 612 return "", fmt.Errorf("test was expected to fail (%s), but it failed with %v which is a different reason reason than expected: %v", willFailReason, testErr, isExpectedErr) 613 } 614 } 615 return "", fmt.Errorf("test failed: %v", testErr) 616 } 617 618 // getDesiredTestParallelism returns the number of tests to run in parallel. 619 func getDesiredTestParallelism() int { 620 numCPU := runtime.NumCPU() 621 if numCPU <= 0 { 622 panic("cannot detect number of cores") 623 } 624 return int(math.Ceil((*containersPerCPU) * float64(numCPU))) 625 } 626 627 // TestCUDA runs CUDA tests. 628 func TestCUDA(t *testing.T) { 629 const defaultMaxDuration = 59*time.Minute + 30*time.Second 630 631 testStart := time.Now() 632 maxDuration := defaultMaxDuration 633 if timeoutFlag := flag.Lookup("timeout"); timeoutFlag != nil { 634 if timeoutFlagStr := timeoutFlag.Value.String(); timeoutFlagStr != "" { 635 timeoutFlagValue, err := time.ParseDuration(timeoutFlagStr) 636 if err != nil { 637 t.Fatalf("--timeout flag %q is not a valid duration: %v", timeoutFlagStr, err) 638 } 639 if timeoutFlagValue != 0 { 640 maxDuration = timeoutFlagValue 641 } 642 } 643 } 644 ctx, cancel := context.WithTimeoutCause(context.Background(), maxDuration, errors.New("overall test timed out")) 645 defer cancel() 646 testDeadline, ok := ctx.Deadline() 647 if !ok { 648 t.Fatal("context had no deadline") 649 } 650 testLog(t, "Test timeout is %v; started at %v, deadline is %v", maxDuration, testStart, testDeadline) 651 652 te, err := GetEnvironment(ctx, t) 653 if err != nil { 654 t.Fatalf("Failed to get test environment: %v", err) 655 } 656 657 // Get a list of sample tests. 658 listContainer := dockerutil.MakeContainer(ctx, t) 659 defer listContainer.CleanUp(ctx) 660 testsList, err := listContainer.Run(ctx, getContainerOpts(), "/list_sample_tests.sh") 661 if err != nil { 662 t.Fatalf("Cannot list sample tests: %v", err) 663 } 664 testsSplit := strings.Split(testsList, "\n") 665 allTests := make([]string, 0, len(testsSplit)) 666 allTestsMap := make(map[string]struct{}, len(testsSplit)) 667 for _, test := range testsSplit { 668 testName := strings.TrimSpace(test) 669 if testName == "" { 670 continue 671 } 672 allTestsMap[testName] = struct{}{} 673 allTests = append(allTests, testName) 674 } 675 numTests := len(allTests) 676 testLog(t, "Number of CUDA sample tests detected: %d", numTests) 677 678 // Check that all tests in test maps still exist. 679 t.Run("CUDA test existence", func(t *testing.T) { 680 for testName := range testCompatibility { 681 if _, ok := allTestsMap[testName]; !ok { 682 t.Errorf("CUDA test %q referenced in `testCompatibility` but it no longer exists, please remove it.", testName) 683 } 684 } 685 }) 686 687 // In order to go through tests efficiently, we reuse containers. 688 // However, running tests serially within the same container would also be 689 // slow. So this test spawns a pool of containers, one per CPU. 690 // This saves time because a lot of the time here is actually spent waiting 691 // for compilation of the CUDA program on the CPU, and isn't actually 692 // blocked on the GPU. However, it is possible that two CUDA tests do end 693 // up running on the GPU at the same time, and that they don't work together 694 // for some reason (e.g. out of GPU memory). 695 // To address this, the test first runs every test in parallel. Then, if 696 // any of them failed, it will run only the failed ones serially. 697 numContainers := getDesiredTestParallelism() 698 testLog(t, "Number of cores is %d, spawning %.1f CUDA containers for each (%d containers total)...", runtime.NumCPU(), *containersPerCPU, numContainers) 699 spawnGroup, spawnCtx := errgroup.WithContext(ctx) 700 containers := make([]*dockerutil.Container, numContainers) 701 for i := 0; i < numContainers; i++ { 702 spawnGroup.Go(func() error { 703 c := dockerutil.MakeContainer(ctx, t) 704 if err := c.Spawn(spawnCtx, getContainerOpts(), "/bin/sleep", "6h"); err != nil { 705 return fmt.Errorf("container %v failed to spawn: %w", c.Name, err) 706 } 707 containers[i] = c 708 return nil 709 }) 710 } 711 if err := spawnGroup.Wait(); err != nil { 712 for _, c := range containers { 713 if c != nil { 714 c.CleanUp(ctx) 715 } 716 } 717 t.Fatalf("Failed to spawn containers: %v", err) 718 } 719 cp := dockerutil.NewContainerPool(containers) 720 defer cp.CleanUp(ctx) 721 var testMu sync.Mutex 722 testsDone := 0 723 var failedTests []string 724 statusFn := func() { 725 now := time.Now() 726 testMu.Lock() 727 defer testMu.Unlock() 728 donePct := 100.0 * float64(testsDone) / float64(numTests) 729 startedAgo := now.Sub(testStart) 730 deadlineIn := testDeadline.Sub(now) 731 durationPct := 100.0 * float64(startedAgo) / float64(testDeadline.Sub(testStart)) 732 testLog(t, "[Timing] %d/%d tests (%.1f%%) finished executing. Test started %v ago, deadline in %v (%.1f%%).", testsDone, numTests, donePct, startedAgo.Truncate(time.Second), deadlineIn.Truncate(time.Second), durationPct) 733 if len(failedTests) > 0 { 734 testLog(t, "[Failed] %d test failed: %v", len(failedTests), strings.Join(failedTests, ", ")) 735 } 736 testLog(t, "[Pool] %v", cp.String()) 737 } 738 if *debug { 739 go func() { 740 ticker := time.NewTicker(5 * time.Second) 741 defer ticker.Stop() 742 for { 743 select { 744 case <-ctx.Done(): 745 return 746 case <-ticker.C: 747 statusFn() 748 } 749 } 750 }() 751 } 752 var samplesTestName string 753 t.Run("Samples", func(t *testing.T) { 754 samplesTestName = t.Name() 755 // Now spawn all subtests in parallel. 756 // All sub-tests will first try to run in parallel using one of the pooled 757 // containers. 758 // Those that failed will try to grab `serialMu` in order to run serially. 759 // Therefore, the main goroutine here holds `serialMu` and only releases 760 // when all parallel test attempts have completed. 761 testutil.NewTree(allTests, "/").RunParallel(t, func(t *testing.T, testName string) { 762 t.Helper() 763 skippedReason, err := runSampleTest(ctx, t, testName, te, cp) 764 if err != nil { 765 t.Errorf("%s: %v", testName, err) 766 } 767 testMu.Lock() 768 defer testMu.Unlock() 769 testsDone++ 770 if t.Failed() && ctx.Err() == nil { 771 failedTests = append(failedTests, testName) 772 } 773 if skippedReason != "" { 774 t.Skip(skippedReason) 775 } 776 }) 777 }) 778 statusFn() 779 testMu.Lock() 780 defer testMu.Unlock() 781 if len(failedTests) > 0 { 782 if ctx.Err() != nil { 783 t.Errorf("%d tests failed prior to timeout:", len(failedTests)) 784 for _, testName := range failedTests { 785 t.Errorf(" %s", testName) 786 } 787 } 788 if len(failedTests) > 0 { 789 t.Errorf("To re-run a specific test locally, either re-run this test with filtering enabled (example: --test.run=%s/%s), or:", samplesTestName, failedTests[0]) 790 t.Errorf( 791 " $ docker run --runtime=%s --gpus=all -e %s --rm %s /run_sample %s", 792 dockerutil.Runtime(), 793 dockerutil.AllGPUCapabilities, 794 getContainerOpts().Image, 795 failedTests[0], 796 ) 797 } 798 } else if poolUtilization := cp.Utilization(); poolUtilization < 0.6 { 799 testLog(t, "WARNING: Pool utilization was only %.1f%%.", poolUtilization*100.0) 800 testLog(t, "This test can be made faster and more efficient with proper test categorization,") 801 testLog(t, "by identifying flaky tests and exclusive-requiring tests.") 802 testLog(t, "Consider going over the logs to identify such tests and categorize them accordingly.") 803 } 804 } 805 806 // TestMain overrides the `test.parallel` flag. 807 func TestMain(m *testing.M) { 808 dockerutil.EnsureSupportedDockerVersion() 809 flag.Parse() 810 // The Go testing library won't run more than GOMAXPROCS parallel tests by 811 // default, and the value of GOMAXPROCS is taken at program initialization 812 // time, so by the time we get here, it is already stuck at GOMAXPROCS. 813 // In order to run more parallel tests than there are cores, we therefore 814 // need to override the `test.parallel` flag here before `m.Run`. 815 testParallelFlag := flag.Lookup("test.parallel") 816 if testParallelFlag == nil { 817 panic("cannot find -test.parallel flag") 818 } 819 if err := testParallelFlag.Value.Set(strconv.Itoa(getDesiredTestParallelism())); err != nil { 820 panic(fmt.Sprintf("cannot set -test.parallel flag: %v", err)) 821 } 822 os.Exit(m.Run()) 823 }