gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/test/gpu/cuda_test.go

gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/test/gpu/cuda_test.go (about)

     1  // Copyright 2024 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package cuda_test tests basic CUDA workloads.
    16  package cuda_test
    17  
    18  import (
    19  	"context"
    20  	"errors"
    21  	"flag"
    22  	"fmt"
    23  	"math"
    24  	"os"
    25  	"runtime"
    26  	"strconv"
    27  	"strings"
    28  	"sync"
    29  	"testing"
    30  	"time"
    31  
    32  	"golang.org/x/sync/errgroup"
    33  	"gvisor.dev/gvisor/pkg/test/dockerutil"
    34  	"gvisor.dev/gvisor/pkg/test/testutil"
    35  )
    36  
    37  const (
    38  	// defaultTestTimeout is the default timeout for a single CUDA sample test.
    39  	defaultTestTimeout = 20 * time.Minute
    40  
    41  	// hangingTestTimeout is the test timeout for tests that are fast when they
    42  	// succeed, but hang forever otherwise.
    43  	hangingTestTimeout = 1 * time.Minute
    44  
    45  	// defaultContainersPerCPU is the default number of pooled containers to
    46  	// spawn for each CPU. This can be a floating-point value.
    47  	// This value was arrived at experimentally and has no particular meaning.
    48  	// Setting it too low will cause the test to take longer than necessary
    49  	// because of insufficient parallelism.
    50  	// However, setting it too high will *also* cause the test to take longer
    51  	// than necessary, because the added resource contention will cause more
    52  	// tests to fail when run in parallel with each other, forcing them to be
    53  	// re-run serialized.
    54  	defaultContainersPerCPU = 1.75
    55  
    56  	// exitCodeWaived is the EXIT_WAIVED constant used in CUDA tests.
    57  	// This exit code is typically used by CUDA tests to indicate that the
    58  	// test requires a capability or condition that is not met in the current
    59  	// test environment.
    60  	exitCodeWaived = 2
    61  )
    62  
    63  // Flags.
    64  var (
    65  	verifyCompatibility = flag.Bool("cuda_verify_compatibility", os.Getenv("GVISOR_TEST_CUDA_VERIFY_COMPATIBILITY") == "true", "whether to verify that all tests are marked as compatible")
    66  	logSuccessfulTests  = flag.Bool("cuda_log_successful_tests", false, "log console output of successful tests")
    67  	debug               = flag.Bool("cuda_test_debug", false, "log more data as the test is running")
    68  	containersPerCPU    = flag.Float64("cuda_containers_per_cpu", defaultContainersPerCPU, "number of parallel execution containers to spawn per CPU (floating point values allowed)")
    69  )
    70  
// testCompatibility maps test names to their compatibility data.
// Unmapped test names are assumed to be fully compatible.
//
// Keys are sample test names in "<category>/<sample>" form. TestCUDA checks
// that every key still appears in the output of /list_sample_tests.sh, so
// entries for samples that no longer exist must be removed.
// Use MultiCompatibility when more than one condition can explain a failure.
var testCompatibility = map[string]Compatibility{
	"0_Introduction/simpleAttributes":     RequiresFeatures(FeaturePersistentL2Caching),
	"0_Introduction/simpleCUDA2GL":        RequiresFeatures(FeatureGL),
	"0_Introduction/simpleIPC":            &BrokenInGVisor{OnlyWhenMultipleGPU: true},
	"0_Introduction/simpleP2P":            MultiCompatibility(&RequiresMultiGPU{}, &BrokenInGVisor{}),
	"0_Introduction/UnifiedMemoryStreams": &BrokenInGVisor{},
	"0_Introduction/vectorAddMMAP":        &BrokenInGVisor{OnlyWhenMultipleGPU: true},
	"2_Concepts_and_Techniques/cuHook": &BrokenEverywhere{
		Reason: "Requires ancient version of glibc (<=2.33)",
	},
	"2_Concepts_and_Techniques/EGLStream_CUDA_Interop": &BrokenEverywhere{
		Reason: "Requires newer version of EGL libraries than Ubuntu has (eglCreateStreamKHR)",
	},
	"2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU": MultiCompatibility(
		&RequiresMultiGPU{},
		&BrokenEverywhere{
			Reason: "Requires newer version of EGL libraries than Ubuntu has (eglCreateStreamKHR)",
		},
	),
	"2_Concepts_and_Techniques/EGLSync_CUDAEvent_Interop":  &OnlyOnWindows{},
	"2_Concepts_and_Techniques/streamOrderedAllocationIPC": &BrokenInGVisor{},
	"2_Concepts_and_Techniques/streamOrderedAllocationP2P": MultiCompatibility(&RequiresMultiGPU{}, &BrokenInGVisor{}),
	"3_CUDA_Features/bf16TensorCoreGemm":                   RequiresFeatures(FeatureTensorCores),
	"3_CUDA_Features/cdpAdvancedQuicksort":                 RequiresFeatures(FeatureDynamicParallelism),
	"3_CUDA_Features/cudaCompressibleMemory":               RequiresFeatures(FeatureCompressibleMemory),
	"3_CUDA_Features/dmmaTensorCoreGemm":                   RequiresFeatures(FeatureTensorCores),
	"3_CUDA_Features/memMapIPCDrv":                         MultiCompatibility(&RequiresMultiGPU{}, &BrokenInGVisor{}),
	"3_CUDA_Features/tf32TensorCoreGemm":                   RequiresFeatures(FeatureTensorCores),
	"4_CUDA_Libraries/conjugateGradientMultiDeviceCG":      MultiCompatibility(&RequiresMultiGPU{}, &BrokenInGVisor{}),
	"4_CUDA_Libraries/cudaNvSci":                           &RequiresNvSci{},
	"4_CUDA_Libraries/cudaNvSciNvMedia":                    &RequiresNvSci{},
	"4_CUDA_Libraries/cuDLAErrorReporting":                 &OnlyOnWindows{},
	"4_CUDA_Libraries/cuDLAHybridMode":                     &OnlyOnWindows{},
	"4_CUDA_Libraries/cuDLAStandaloneMode":                 &OnlyOnWindows{},
	"4_CUDA_Libraries/cuDLALayerwiseStatsHybrid":           &OnlyOnWindows{},
	"4_CUDA_Libraries/cuDLALayerwiseStatsStandalone":       &OnlyOnWindows{},
	"4_CUDA_Libraries/simpleCUFFT_2d_MGPU":                 MultiCompatibility(&RequiresMultiGPU{}, &BrokenInGVisor{}),
	"4_CUDA_Libraries/simpleCUFFT_MGPU":                    MultiCompatibility(&RequiresMultiGPU{}, &BrokenInGVisor{}),
	"5_Domain_Specific/fluidsD3D9":                         &OnlyOnWindows{},
	"5_Domain_Specific/fluidsGL":                           RequiresFeatures(FeatureGL),
	"5_Domain_Specific/fluidsGLES":                         &OnlyOnWindows{},
	"5_Domain_Specific/nbody_opengles":                     &OnlyOnWindows{},
	"5_Domain_Specific/nbody_screen":                       &OnlyOnWindows{},
	"5_Domain_Specific/p2pBandwidthLatencyTest":            &BrokenInGVisor{OnlyWhenMultipleGPU: true},
	"5_Domain_Specific/postProcessGL":                      RequiresFeatures(FeatureGL),
	"5_Domain_Specific/simpleD3D10":                        &OnlyOnWindows{},
	"5_Domain_Specific/simpleD3D10RenderTarget":            &OnlyOnWindows{},
	"5_Domain_Specific/simpleD3D10Texture":                 &OnlyOnWindows{},
	"5_Domain_Specific/simpleD3D11":                        &OnlyOnWindows{},
	"5_Domain_Specific/simpleD3D11Texture":                 &OnlyOnWindows{},
	"5_Domain_Specific/simpleD3D12":                        &OnlyOnWindows{},
	"5_Domain_Specific/simpleD3D9":                         &OnlyOnWindows{},
	"5_Domain_Specific/simpleD3D9Texture":                  &OnlyOnWindows{},
	"5_Domain_Specific/simpleGLES":                         &OnlyOnWindows{},
	"5_Domain_Specific/simpleGLES_EGLOutput":               &OnlyOnWindows{},
	"5_Domain_Specific/simpleGLES_screen":                  &OnlyOnWindows{},
	"5_Domain_Specific/simpleVulkan":                       RequiresFeatures(FeatureGL),
	"5_Domain_Specific/simpleVulkanMMAP":                   RequiresFeatures(FeatureGL),
	"5_Domain_Specific/SLID3D10Texture":                    &OnlyOnWindows{},
	"5_Domain_Specific/VFlockingD3D10":                     &OnlyOnWindows{},
	"5_Domain_Specific/vulkanImageCUDA":                    RequiresFeatures(FeatureGL),
}
   135  
   136  // flakyTests is a list of tests that are flaky.
   137  // These will be retried up to 3 times in parallel before running serially.
   138  var flakyTests = map[string]struct{}{}
   139  
   140  // exclusiveTests is a list of tests that must run exclusively (i.e. with
   141  // no other test running on the machine at the same time), or they will
   142  // likely fail. These tests are not attempted to be run in parallel.
   143  // This is usually the case for performance tests or tests that use a lot
   144  // of resources in general.
   145  // This saves the trouble to run them in parallel, while also avoiding
   146  // causing spurious failures for the tests that happen to be running in
   147  // parallel with them.
   148  var exclusiveTests = map[string]struct{}{
   149  	"6_Performance/alignedTypes":      {},
   150  	"6_Performance/transpose":         {},
   151  	"6_Performance/UnifiedMemoryPerf": {},
   152  }
   153  
   154  // alwaysSkippedTests don't run at all, ever, and are not verified when
   155  // --cuda_verify_compatibility is set.
   156  // Each test is mapped to a reason why it should be skipped.
   157  var alwaysSkippedTests = map[string]string{
   158  	// These tests seem to flake in gVisor, but consistently within the same
   159  	// run of the overall test, so they cannot be included in `flakyTests`.
   160  	"0_Introduction/simpleAssert":       "Flaky in gVisor",
   161  	"0_Introduction/simpleAssert_nvrtc": "Flaky in gVisor",
   162  }
   163  
   164  // Feature is a feature as listed by /list_features.sh.
   165  type Feature string
   166  
   167  // All CUDA features listed by /list_features.sh.
   168  const (
   169  	FeaturePersistentL2Caching Feature = "PERSISTENT_L2_CACHING"
   170  	FeatureDynamicParallelism  Feature = "DYNAMIC_PARALLELISM"
   171  	FeatureGL                  Feature = "GL"
   172  	FeatureTensorCores         Feature = "TENSOR_CORES"
   173  	FeatureCompressibleMemory  Feature = "COMPRESSIBLE_MEMORY"
   174  )
   175  
   176  // allFeatures is a list of all CUDA features above.
   177  var allFeatures = []Feature{
   178  	FeaturePersistentL2Caching,
   179  	FeatureDynamicParallelism,
   180  	FeatureGL,
   181  	FeatureTensorCores,
   182  	FeatureCompressibleMemory,
   183  }
   184  
// TestEnvironment represents the environment in which a sample test runs.
// `Compatibility` implementations consult it to decide whether a test is
// expected to fail.
type TestEnvironment struct {
	// NumGPUs is the number of GPUs detected.
	NumGPUs         int
	// RuntimeIsGVisor is true when the container runtime was detected as
	// gVisor.
	RuntimeIsGVisor bool
	// Features maps each CUDA feature to whether it is available, as reported
	// by /list_features.sh.
	Features        map[Feature]bool
}
   191  
// Compatibility encodes the compatibility of a test depending on the
// environment it runs in.
type Compatibility interface {
	// WillFail returns a string explaining why the test is expected to fail
	// in the given environment, or "" if it isn't expected to fail.
	WillFail(ctx context.Context, env *TestEnvironment) string

	// IsExpectedFailure checks whether the `logs` (from a failed run of the test
	// in the given environment) matches the failure that this test expects in
	// that environment. If they match, this function should return nil.
	// Otherwise, it should return an error describing the mismatch.
	// `exitCode` is the exit status of the failed run; it is 0 when the
	// failure carried no exit status.
	// It is only called when `WillFail` returns a non-empty string for the same
	// environment, so it may assume that `env` is non-compatible.
	IsExpectedFailure(ctx context.Context, env *TestEnvironment, logs string, exitCode int) error
}
   206  
   207  // BrokenEverywhere implements `Compatibility` for tests that are broken in
   208  // all environments.
   209  type BrokenEverywhere struct {
   210  	Reason string
   211  }
   212  
   213  // WillFail implements `Compatibility.WillFail`.
   214  func (be *BrokenEverywhere) WillFail(ctx context.Context, env *TestEnvironment) string {
   215  	return fmt.Sprintf("Known-broken test: %v", be.Reason)
   216  }
   217  
   218  // IsExpectedFailure implements `Compatibility.IsExpectedFailure`.
   219  func (*BrokenEverywhere) IsExpectedFailure(ctx context.Context, env *TestEnvironment, logs string, exitCode int) error {
   220  	return nil
   221  }
   222  
   223  // BrokenInGVisor implements `Compatibility` for tests that are broken in
   224  // gVisor only.
   225  type BrokenInGVisor struct {
   226  	// OnlyWhenMultipleGPU may be set to true for tests which only fail when
   227  	// multiple GPUs are present. This should not be used for tests that
   228  	// *require* multiple GPUs to run (use RequiresMultiGPU instead).
   229  	// This is for tests that can run on a single or multiple GPUs alike,
   230  	// but specifically fail in gVisor when run with multiple GPUs.
   231  	OnlyWhenMultipleGPU bool
   232  
   233  	// KnownToHang may be set to true for short tests which can hang instead
   234  	// of failing. This avoids waiting ~forever for them to finish.
   235  	KnownToHang bool
   236  }
   237  
   238  // WillFail implements `Compatibility.WillFail`.
   239  func (big *BrokenInGVisor) WillFail(ctx context.Context, env *TestEnvironment) string {
   240  	if !env.RuntimeIsGVisor {
   241  		return ""
   242  	}
   243  	if big.OnlyWhenMultipleGPU && env.NumGPUs == 1 {
   244  		return ""
   245  	}
   246  	if big.OnlyWhenMultipleGPU {
   247  		return "Known to be broken in gVisor when multiple GPUs are present"
   248  	}
   249  	return "Known to be broken in gVisor"
   250  }
   251  
   252  // IsExpectedFailure implements `Compatibility.IsExpectedFailure`.
   253  func (*BrokenInGVisor) IsExpectedFailure(ctx context.Context, env *TestEnvironment, logs string, exitCode int) error {
   254  	return nil
   255  }
   256  
   257  // RequiresMultiGPU implements `Compatibility` for tests that require multiple
   258  // GPUs.
   259  type RequiresMultiGPU struct{}
   260  
   261  // WillFail implements `Compatibility.WillFail`.
   262  func (*RequiresMultiGPU) WillFail(ctx context.Context, env *TestEnvironment) string {
   263  	if env.NumGPUs < 2 {
   264  		return "Requires >= 2 GPUs"
   265  	}
   266  	return ""
   267  }
   268  
   269  // IsExpectedFailure implements `Compatibility.IsExpectedFailure`.
   270  func (*RequiresMultiGPU) IsExpectedFailure(ctx context.Context, env *TestEnvironment, logs string, exitCode int) error {
   271  	if exitCode != exitCodeWaived {
   272  		return fmt.Errorf("exit code %d, expected EXIT_WAIVED (%d)", exitCode, exitCodeWaived)
   273  	}
   274  	return nil
   275  }
   276  
   277  // requiresFeatures implements `Compatibility` for tests that require
   278  // specific features.
   279  type requiresFeatures struct {
   280  	features []Feature
   281  }
   282  
   283  func RequiresFeatures(features ...Feature) Compatibility {
   284  	return &requiresFeatures{features: features}
   285  }
   286  
   287  // WillFail implements `Compatibility.WillFail`.
   288  func (r *requiresFeatures) WillFail(ctx context.Context, env *TestEnvironment) string {
   289  	for _, feature := range r.features {
   290  		if !env.Features[feature] {
   291  			return fmt.Sprintf("Requires feature %s", feature)
   292  		}
   293  	}
   294  	return ""
   295  }
   296  
   297  // IsExpectedFailure implements `Compatibility.IsExpectedFailure`.
   298  func (*requiresFeatures) IsExpectedFailure(ctx context.Context, env *TestEnvironment, logs string, exitCode int) error {
   299  	if exitCode != exitCodeWaived {
   300  		return fmt.Errorf("exit code %d, expected EXIT_WAIVED (%d)", exitCode, exitCodeWaived)
   301  	}
   302  	return nil
   303  }
   304  
   305  // OnlyOnWindows implements `Compatibility` for tests that are only expected
   306  // to only pass on Windows.
   307  type OnlyOnWindows struct{}
   308  
   309  // WillFail implements `Compatibility.WillFail`.
   310  func (*OnlyOnWindows) WillFail(ctx context.Context, env *TestEnvironment) string {
   311  	if runtime.GOOS != "windows" {
   312  		return "Only runs on Windows"
   313  	}
   314  	return ""
   315  }
   316  
   317  // IsExpectedFailure implements `Compatibility.IsExpectedFailure`.
   318  func (*OnlyOnWindows) IsExpectedFailure(ctx context.Context, env *TestEnvironment, logs string, exitCode int) error {
   319  	if strings.Contains(logs, "is not supported on Linux") {
   320  		return nil
   321  	}
   322  	if exitCode != exitCodeWaived {
   323  		return fmt.Errorf("exit code %d, expected EXIT_WAIVED (%d)", exitCode, exitCodeWaived)
   324  	}
   325  	return nil
   326  }
   327  
   328  type RequiresNvSci struct{}
   329  
   330  // WillFail implements `Compatibility.WillFail`.
   331  func (*RequiresNvSci) WillFail(ctx context.Context, env *TestEnvironment) string {
   332  	return "Requires NvSci library which is not open-source"
   333  }
   334  
   335  // IsExpectedFailure implements `Compatibility.IsExpectedFailure`.
   336  func (*RequiresNvSci) IsExpectedFailure(ctx context.Context, env *TestEnvironment, logs string, exitCode int) error {
   337  	return nil
   338  }
   339  
   340  // multiCompatibility implements `Compatibility` with multiple possible
   341  // Compatibility implementations.
   342  type multiCompatibility struct {
   343  	compats []Compatibility
   344  }
   345  
   346  // MultiCompatibility implements `Compatibility` with multiple possible
   347  // Compatibility implementations.
   348  func MultiCompatibility(compats ...Compatibility) Compatibility {
   349  	return &multiCompatibility{compats: compats}
   350  }
   351  
   352  // WillFail implements `Compatibility.WillFail`.
   353  func (mc *multiCompatibility) WillFail(ctx context.Context, env *TestEnvironment) string {
   354  	for _, compat := range mc.compats {
   355  		if reason := compat.WillFail(ctx, env); reason != "" {
   356  			return reason
   357  		}
   358  	}
   359  	return ""
   360  }
   361  
   362  // IsExpectedFailure implements `Compatibility.IsExpectedFailure`.
   363  func (mc *multiCompatibility) IsExpectedFailure(ctx context.Context, env *TestEnvironment, logs string, exitCode int) error {
   364  	var possibleCompats []Compatibility
   365  	for _, compat := range mc.compats {
   366  		if reason := compat.WillFail(ctx, env); reason != "" {
   367  			possibleCompats = append(possibleCompats, compat)
   368  		}
   369  	}
   370  	if len(possibleCompats) == 0 {
   371  		return errors.New("no known explanation for this failure")
   372  	}
   373  	var errs []string
   374  	for _, compat := range possibleCompats {
   375  		err := compat.IsExpectedFailure(ctx, env, logs, exitCode)
   376  		if err == nil {
   377  			return nil
   378  		}
   379  		errs = append(errs, fmt.Sprintf("might have been broken because %s but %v", compat.WillFail(ctx, env), err))
   380  	}
   381  	return fmt.Errorf("no known explanation for this failure: %v", strings.Join(errs, "; "))
   382  }
   383  
   384  // FullyCompatible implements `Compatibility` for tests that are expected to
   385  // pass in any environment.
   386  type FullyCompatible struct{}
   387  
   388  // WillFail implements `Compatibility.WillFail`.
   389  func (*FullyCompatible) WillFail(ctx context.Context, env *TestEnvironment) string {
   390  	return ""
   391  }
   392  
   393  // IsExpectedFailure implements `Compatibility.IsExpectedFailure`.
   394  func (*FullyCompatible) IsExpectedFailure(ctx context.Context, env *TestEnvironment, logs string, exitCode int) error {
   395  	return errors.New("test is expected to pass regardless of environment")
   396  }
   397  
   398  // getContainerOpts returns the container run options to run CUDA tests.
   399  func getContainerOpts() dockerutil.RunOpts {
   400  	opts := dockerutil.GPURunOpts()
   401  	opts.Image = "gpu/cuda-tests"
   402  	return opts
   403  }
   404  
   405  // testLog logs a line as a test log.
   406  // If debug is enabled, it is also printed immediately to stderr.
   407  // This is useful for debugging tests.
   408  func testLog(t *testing.T, format string, values ...any) {
   409  	t.Helper()
   410  	if *debug {
   411  		fmt.Fprintf(os.Stderr, "[%s] %s\n", t.Name(), fmt.Sprintf(format, values...))
   412  	}
   413  	t.Logf(format, values...)
   414  }
   415  
   416  // multiLineLog logs a multiline string as separate log messages to `t`.
   417  // This is useful to log multi-line container logs without them looking weird
   418  // with line breaks in the middle.
   419  func multiLineLog(t *testing.T, output string) {
   420  	t.Helper()
   421  	for _, line := range strings.Split(output, "\n") {
   422  		// `line` may contain % characters here, so we need to format it through
   423  		// `%s` so that `%` characters don't show up as "MISSING" in the logs.
   424  		testLog(t, "%s", line)
   425  	}
   426  }
   427  
   428  // GetEnvironment returns the environment in which a sample test runs.
   429  func GetEnvironment(ctx context.Context, t *testing.T) (*TestEnvironment, error) {
   430  	numGPU := dockerutil.NumGPU()
   431  	if numGPU == 0 {
   432  		return nil, errors.New("no GPUs detected")
   433  	}
   434  	if numGPU == 1 {
   435  		testLog(t, "1 GPU detected")
   436  	} else {
   437  		testLog(t, "%d GPUs detected", numGPU)
   438  	}
   439  	runtimeIsGVisor, err := dockerutil.IsGVisorRuntime(ctx, t)
   440  	if err != nil {
   441  		return nil, fmt.Errorf("cannot determine if runtime is gVisor or not: %w", err)
   442  	}
   443  	if runtimeIsGVisor {
   444  		testLog(t, "Runtime is detected as gVisor")
   445  	} else {
   446  		testLog(t, "Runtime is detected as not gVisor")
   447  	}
   448  	featuresContainer := dockerutil.MakeContainer(ctx, t)
   449  	defer featuresContainer.CleanUp(ctx)
   450  	featuresList, err := featuresContainer.Run(ctx, getContainerOpts(), "/list_features.sh")
   451  	if err != nil {
   452  		return nil, fmt.Errorf("cannot get list of CUDA features: %v", err)
   453  	}
   454  	features := make(map[Feature]bool)
   455  	for _, line := range strings.Split(featuresList, "\n") {
   456  		line = strings.TrimSpace(line)
   457  		if line == "" {
   458  			continue
   459  		}
   460  		featureAvailable := false
   461  		var feature Feature
   462  		if strings.HasPrefix(line, "PRESENT: ") {
   463  			featureAvailable = true
   464  			feature = Feature(strings.TrimPrefix(line, "PRESENT: "))
   465  		} else if strings.HasPrefix(line, "ABSENT: ") {
   466  			featureAvailable = false
   467  			feature = Feature(strings.TrimPrefix(line, "ABSENT: "))
   468  		} else {
   469  			return nil, fmt.Errorf("unexpected CUDA feature line: %q", line)
   470  		}
   471  		found := false
   472  		for _, f := range allFeatures {
   473  			if feature == f {
   474  				features[f] = featureAvailable
   475  				if featureAvailable {
   476  					testLog(t, "CUDA feature is available: %s", string(f))
   477  				} else {
   478  					testLog(t, "CUDA feature is *not* available: %s", string(f))
   479  				}
   480  				found = true
   481  				break
   482  			}
   483  		}
   484  		if !found {
   485  			return nil, fmt.Errorf("unknown CUDA feature: %s", string(feature))
   486  		}
   487  	}
   488  	for _, feature := range allFeatures {
   489  		if _, ok := features[feature]; !ok {
   490  			return nil, fmt.Errorf("CUDA feature not found in feature list: %s", string(feature))
   491  		}
   492  	}
   493  	// Use CUDA dynamic parallelism as a litmus test to see if the features were
   494  	// enumerated correctly.
   495  	if _, hasDynamicParallelism := features[FeatureDynamicParallelism]; !hasDynamicParallelism {
   496  		return nil, errors.New("CUDA feature Dynamic Parallelism is not available yet should be available in all environments gVisor supports; this indicates a failure in the feature listing script")
   497  	}
   498  	return &TestEnvironment{
   499  		NumGPUs:         numGPU,
   500  		RuntimeIsGVisor: runtimeIsGVisor,
   501  		Features:        features,
   502  	}, nil
   503  }
   504  
   505  // runSampleTest runs a single CUDA sample test.
   506  // It first tries to run in pooled container.
   507  // If that fails, then it runs in an exclusive container.
   508  // It returns a skip reason (or empty if the test was not skipped), and
   509  // an error if the test fails.
   510  func runSampleTest(ctx context.Context, t *testing.T, testName string, te *TestEnvironment, cp *dockerutil.ContainerPool) (string, error) {
   511  	compat, found := testCompatibility[testName]
   512  	if !found {
   513  		compat = &FullyCompatible{}
   514  	}
   515  	willFailReason := compat.WillFail(ctx, te)
   516  	if willFailReason != "" && !*verifyCompatibility {
   517  		return fmt.Sprintf("this test is expected to fail (%s) --cuda_verify_compatibility=true to verify compatibility)", willFailReason), nil
   518  	}
   519  	if skipReason, isAlwaysSkipped := alwaysSkippedTests[testName]; isAlwaysSkipped {
   520  		return fmt.Sprintf("this test is always skipped (%v)", skipReason), nil
   521  	}
   522  	testTimeout := defaultTestTimeout
   523  	execTestTimeout := testTimeout - 15*time.Second
   524  	testAttempts := 1
   525  	if _, isFlakyTest := flakyTests[testName]; isFlakyTest {
   526  		testAttempts = 3
   527  	}
   528  	parallelAttempts := testAttempts
   529  	if _, isExclusiveTest := exclusiveTests[testName]; isExclusiveTest {
   530  		parallelAttempts = 0
   531  	}
   532  	for attempt := 0; attempt < parallelAttempts; attempt++ {
   533  		c, release, err := cp.Get(ctx)
   534  		if err != nil {
   535  			release()
   536  			return "", fmt.Errorf("failed to get container: %v", err)
   537  		}
   538  		cp.SetContainerLabel(c, fmt.Sprintf("Running %s in parallel (attempt %d/%d)", testName, attempt+1, parallelAttempts))
   539  		testLog(t, "Running test in parallel mode in container %s (attempt %d/%d)...", c.Name, attempt+1, parallelAttempts)
   540  		parallelCtx, parallelCancel := context.WithTimeoutCause(ctx, testTimeout, errors.New("parallel execution took too long"))
   541  		testStartedAt := time.Now()
   542  		output, err := c.Exec(parallelCtx, dockerutil.ExecOpts{}, "/run_sample", fmt.Sprintf("--timeout=%v", execTestTimeout), testName)
   543  		testDuration := time.Since(testStartedAt)
   544  		parallelCancel()
   545  		release()
   546  		if err == nil {
   547  			if willFailReason != "" {
   548  				multiLineLog(t, output)
   549  				return "", fmt.Errorf("test unexpectedly succeeded, but we expected it to fail: %s; please update `testCompatibility`", willFailReason)
   550  			}
   551  			// Only log the output when the test succeeds here.
   552  			// If it fails, we'll run exclusively below, and the output from *that*
   553  			// run will be logged instead.
   554  			if *logSuccessfulTests {
   555  				multiLineLog(t, output)
   556  			}
   557  			testLog(t, "Test passed in parallel mode in %v.", testDuration)
   558  			return "", nil
   559  		}
   560  		var exitCode int
   561  		if execErr, ok := err.(*dockerutil.ExecError); ok {
   562  			exitCode = execErr.ExitStatus
   563  		}
   564  		if willFailReason != "" {
   565  			isExpectedErr := compat.IsExpectedFailure(ctx, te, output, exitCode)
   566  			if isExpectedErr == nil {
   567  				testLog(t, "Test failed as expected: %s (took %v)", willFailReason, testDuration)
   568  				return "", nil
   569  			}
   570  		}
   571  	}
   572  	if parallelAttempts > 0 {
   573  		testLog(t, "Will re-run the test in exclusive mode.")
   574  	}
   575  	c, release, err := cp.GetExclusive(ctx)
   576  	defer release()
   577  	if err != nil {
   578  		return "", fmt.Errorf("failed to get excusive container: %v", err)
   579  	}
   580  	var testErr error
   581  	for attempt := 0; attempt < testAttempts; attempt++ {
   582  		cp.SetContainerLabel(c, fmt.Sprintf("Running %s exclusively (attempt %d/%d)", testName, attempt+1, testAttempts))
   583  		testLog(t, "Running test in exclusive mode in container %s (attempt %d/%d)...", c.Name, attempt+1, testAttempts)
   584  		exclusiveCtx, exclusiveCancel := context.WithTimeoutCause(ctx, testTimeout, errors.New("exclusive execution took too long"))
   585  		testStartedAt := time.Now()
   586  		var output string
   587  		output, testErr = c.Exec(exclusiveCtx, dockerutil.ExecOpts{}, "/run_sample", fmt.Sprintf("--timeout=%v", execTestTimeout), testName)
   588  		testDuration := time.Since(testStartedAt)
   589  		exclusiveCancel()
   590  		if testErr == nil {
   591  			if willFailReason != "" {
   592  				multiLineLog(t, output)
   593  				return "", fmt.Errorf("test unexpectedly succeeded, but we expected it to fail: %s; please update `testCompatibility`", willFailReason)
   594  			}
   595  			if *logSuccessfulTests {
   596  				multiLineLog(t, output)
   597  			}
   598  			testLog(t, "Test passed in exclusive mode in %v.", testDuration)
   599  			return "", nil
   600  		}
   601  		multiLineLog(t, output)
   602  		var exitCode int
   603  		if execErr, ok := testErr.(*dockerutil.ExecError); ok {
   604  			exitCode = execErr.ExitStatus
   605  		}
   606  		if willFailReason != "" {
   607  			isExpectedErr := compat.IsExpectedFailure(ctx, te, output, exitCode)
   608  			if isExpectedErr == nil {
   609  				testLog(t, "Test failed as expected: %s (took %v)", willFailReason, testDuration)
   610  				return "", nil
   611  			}
   612  			return "", fmt.Errorf("test was expected to fail (%s), but it failed with %v which is a different reason reason than expected: %v", willFailReason, testErr, isExpectedErr)
   613  		}
   614  	}
   615  	return "", fmt.Errorf("test failed: %v", testErr)
   616  }
   617  
   618  // getDesiredTestParallelism returns the number of tests to run in parallel.
   619  func getDesiredTestParallelism() int {
   620  	numCPU := runtime.NumCPU()
   621  	if numCPU <= 0 {
   622  		panic("cannot detect number of cores")
   623  	}
   624  	return int(math.Ceil((*containersPerCPU) * float64(numCPU)))
   625  }
   626  
   627  // TestCUDA runs CUDA tests.
   628  func TestCUDA(t *testing.T) {
   629  	const defaultMaxDuration = 59*time.Minute + 30*time.Second
   630  
   631  	testStart := time.Now()
   632  	maxDuration := defaultMaxDuration
   633  	if timeoutFlag := flag.Lookup("timeout"); timeoutFlag != nil {
   634  		if timeoutFlagStr := timeoutFlag.Value.String(); timeoutFlagStr != "" {
   635  			timeoutFlagValue, err := time.ParseDuration(timeoutFlagStr)
   636  			if err != nil {
   637  				t.Fatalf("--timeout flag %q is not a valid duration: %v", timeoutFlagStr, err)
   638  			}
   639  			if timeoutFlagValue != 0 {
   640  				maxDuration = timeoutFlagValue
   641  			}
   642  		}
   643  	}
   644  	ctx, cancel := context.WithTimeoutCause(context.Background(), maxDuration, errors.New("overall test timed out"))
   645  	defer cancel()
   646  	testDeadline, ok := ctx.Deadline()
   647  	if !ok {
   648  		t.Fatal("context had no deadline")
   649  	}
   650  	testLog(t, "Test timeout is %v; started at %v, deadline is %v", maxDuration, testStart, testDeadline)
   651  
   652  	te, err := GetEnvironment(ctx, t)
   653  	if err != nil {
   654  		t.Fatalf("Failed to get test environment: %v", err)
   655  	}
   656  
   657  	// Get a list of sample tests.
   658  	listContainer := dockerutil.MakeContainer(ctx, t)
   659  	defer listContainer.CleanUp(ctx)
   660  	testsList, err := listContainer.Run(ctx, getContainerOpts(), "/list_sample_tests.sh")
   661  	if err != nil {
   662  		t.Fatalf("Cannot list sample tests: %v", err)
   663  	}
   664  	testsSplit := strings.Split(testsList, "\n")
   665  	allTests := make([]string, 0, len(testsSplit))
   666  	allTestsMap := make(map[string]struct{}, len(testsSplit))
   667  	for _, test := range testsSplit {
   668  		testName := strings.TrimSpace(test)
   669  		if testName == "" {
   670  			continue
   671  		}
   672  		allTestsMap[testName] = struct{}{}
   673  		allTests = append(allTests, testName)
   674  	}
   675  	numTests := len(allTests)
   676  	testLog(t, "Number of CUDA sample tests detected: %d", numTests)
   677  
   678  	// Check that all tests in test maps still exist.
   679  	t.Run("CUDA test existence", func(t *testing.T) {
   680  		for testName := range testCompatibility {
   681  			if _, ok := allTestsMap[testName]; !ok {
   682  				t.Errorf("CUDA test %q referenced in `testCompatibility` but it no longer exists, please remove it.", testName)
   683  			}
   684  		}
   685  	})
   686  
   687  	// In order to go through tests efficiently, we reuse containers.
   688  	// However, running tests serially within the same container would also be
   689  	// slow. So this test spawns a pool of containers, one per CPU.
   690  	// This saves time because a lot of the time here is actually spent waiting
   691  	// for compilation of the CUDA program on the CPU, and isn't actually
   692  	// blocked on the GPU. However, it is possible that two CUDA tests do end
   693  	// up running on the GPU at the same time, and that they don't work together
   694  	// for some reason (e.g. out of GPU memory).
   695  	// To address this, the test first runs every test in parallel. Then, if
   696  	// any of them failed, it will run only the failed ones serially.
   697  	numContainers := getDesiredTestParallelism()
   698  	testLog(t, "Number of cores is %d, spawning %.1f CUDA containers for each (%d containers total)...", runtime.NumCPU(), *containersPerCPU, numContainers)
   699  	spawnGroup, spawnCtx := errgroup.WithContext(ctx)
   700  	containers := make([]*dockerutil.Container, numContainers)
   701  	for i := 0; i < numContainers; i++ {
   702  		spawnGroup.Go(func() error {
   703  			c := dockerutil.MakeContainer(ctx, t)
   704  			if err := c.Spawn(spawnCtx, getContainerOpts(), "/bin/sleep", "6h"); err != nil {
   705  				return fmt.Errorf("container %v failed to spawn: %w", c.Name, err)
   706  			}
   707  			containers[i] = c
   708  			return nil
   709  		})
   710  	}
   711  	if err := spawnGroup.Wait(); err != nil {
   712  		for _, c := range containers {
   713  			if c != nil {
   714  				c.CleanUp(ctx)
   715  			}
   716  		}
   717  		t.Fatalf("Failed to spawn containers: %v", err)
   718  	}
   719  	cp := dockerutil.NewContainerPool(containers)
   720  	defer cp.CleanUp(ctx)
   721  	var testMu sync.Mutex
   722  	testsDone := 0
   723  	var failedTests []string
   724  	statusFn := func() {
   725  		now := time.Now()
   726  		testMu.Lock()
   727  		defer testMu.Unlock()
   728  		donePct := 100.0 * float64(testsDone) / float64(numTests)
   729  		startedAgo := now.Sub(testStart)
   730  		deadlineIn := testDeadline.Sub(now)
   731  		durationPct := 100.0 * float64(startedAgo) / float64(testDeadline.Sub(testStart))
   732  		testLog(t, "[Timing] %d/%d tests (%.1f%%) finished executing. Test started %v ago, deadline in %v (%.1f%%).", testsDone, numTests, donePct, startedAgo.Truncate(time.Second), deadlineIn.Truncate(time.Second), durationPct)
   733  		if len(failedTests) > 0 {
   734  			testLog(t, "[Failed] %d test failed: %v", len(failedTests), strings.Join(failedTests, ", "))
   735  		}
   736  		testLog(t, "[Pool] %v", cp.String())
   737  	}
   738  	if *debug {
   739  		go func() {
   740  			ticker := time.NewTicker(5 * time.Second)
   741  			defer ticker.Stop()
   742  			for {
   743  				select {
   744  				case <-ctx.Done():
   745  					return
   746  				case <-ticker.C:
   747  					statusFn()
   748  				}
   749  			}
   750  		}()
   751  	}
   752  	var samplesTestName string
   753  	t.Run("Samples", func(t *testing.T) {
   754  		samplesTestName = t.Name()
   755  		// Now spawn all subtests in parallel.
   756  		// All sub-tests will first try to run in parallel using one of the pooled
   757  		// containers.
   758  		// Those that failed will try to grab `serialMu` in order to run serially.
   759  		// Therefore, the main goroutine here holds `serialMu` and only releases
   760  		// when all parallel test attempts have completed.
   761  		testutil.NewTree(allTests, "/").RunParallel(t, func(t *testing.T, testName string) {
   762  			t.Helper()
   763  			skippedReason, err := runSampleTest(ctx, t, testName, te, cp)
   764  			if err != nil {
   765  				t.Errorf("%s: %v", testName, err)
   766  			}
   767  			testMu.Lock()
   768  			defer testMu.Unlock()
   769  			testsDone++
   770  			if t.Failed() && ctx.Err() == nil {
   771  				failedTests = append(failedTests, testName)
   772  			}
   773  			if skippedReason != "" {
   774  				t.Skip(skippedReason)
   775  			}
   776  		})
   777  	})
   778  	statusFn()
   779  	testMu.Lock()
   780  	defer testMu.Unlock()
   781  	if len(failedTests) > 0 {
   782  		if ctx.Err() != nil {
   783  			t.Errorf("%d tests failed prior to timeout:", len(failedTests))
   784  			for _, testName := range failedTests {
   785  				t.Errorf("  %s", testName)
   786  			}
   787  		}
   788  		if len(failedTests) > 0 {
   789  			t.Errorf("To re-run a specific test locally, either re-run this test with filtering enabled (example: --test.run=%s/%s), or:", samplesTestName, failedTests[0])
   790  			t.Errorf(
   791  				"  $ docker run --runtime=%s --gpus=all -e %s --rm %s /run_sample %s",
   792  				dockerutil.Runtime(),
   793  				dockerutil.AllGPUCapabilities,
   794  				getContainerOpts().Image,
   795  				failedTests[0],
   796  			)
   797  		}
   798  	} else if poolUtilization := cp.Utilization(); poolUtilization < 0.6 {
   799  		testLog(t, "WARNING: Pool utilization was only %.1f%%.", poolUtilization*100.0)
   800  		testLog(t, "This test can be made faster and more efficient with proper test categorization,")
   801  		testLog(t, "by identifying flaky tests and exclusive-requiring tests.")
   802  		testLog(t, "Consider going over the logs to identify such tests and categorize them accordingly.")
   803  	}
   804  }
   805  
   806  // TestMain overrides the `test.parallel` flag.
   807  func TestMain(m *testing.M) {
   808  	dockerutil.EnsureSupportedDockerVersion()
   809  	flag.Parse()
   810  	// The Go testing library won't run more than GOMAXPROCS parallel tests by
   811  	// default, and the value of GOMAXPROCS is taken at program initialization
   812  	// time, so by the time we get here, it is already stuck at GOMAXPROCS.
   813  	// In order to run more parallel tests than there are cores, we therefore
   814  	// need to override the `test.parallel` flag here before `m.Run`.
   815  	testParallelFlag := flag.Lookup("test.parallel")
   816  	if testParallelFlag == nil {
   817  		panic("cannot find -test.parallel flag")
   818  	}
   819  	if err := testParallelFlag.Value.Set(strconv.Itoa(getDesiredTestParallelism())); err != nil {
   820  		panic(fmt.Sprintf("cannot set -test.parallel flag: %v", err))
   821  	}
   822  	os.Exit(m.Run())
   823  }