gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/test/runtimes/proctor/main.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Binary proctor runs the test for a particular runtime. It is meant to be
    16  // included in Docker images for all runtime tests.
    17  package main
    18  
import (
	"flag"
	"fmt"
	"log"
	"os"
	"strings"
	"sync"
	"time"

	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/test/runtimes/proctor/lib"
)
    30  
    31  var (
    32  	runtime           = flag.String("runtime", "", "name of runtime")
    33  	list              = flag.Bool("list", false, "list all available tests")
    34  	testNames         = flag.String("tests", "", "run a subset of the available tests")
    35  	pause             = flag.Bool("pause", false, "cause container to pause indefinitely, reaping any zombie children")
    36  	timeout           = flag.Duration("timeout", 90*time.Minute, "batch timeout")
    37  	perTestTimeout    = flag.Duration("per_test_timeout", 20*time.Minute, "per-test timeout (a value of 0 disables per-test timeouts)")
    38  	runsPerTest       = flag.Int("runs_per_test", 1, "number of times to run each test (a value of 0 is the same as a value of 1, i.e. running once)")
    39  	flakyIsError      = flag.Bool("flaky_is_error", true, "if true, when running with multiple --runs_per_test, tests with inconsistent status will result in a failure status code for the batch; if false, they will be considered as passing")
    40  	flakyShortCircuit = flag.Bool("flaky_short_circuit", true, "if true, when running with multiple --runs_per_test and a test is detected as flaky, exit immediately rather than running all --runs_per_test")
    41  )
    42  
    43  // setNumFilesLimit changes the NOFILE soft rlimit if it is too high.
    44  func setNumFilesLimit() error {
    45  	// In docker containers, the default value of the NOFILE limit is
    46  	// 1048576. A few runtime tests (e.g. python:test_subprocess)
    47  	// enumerates all possible file descriptors and these tests can fail by
    48  	// timeout if the NOFILE limit is too high. On gVisor, syscalls are
    49  	// slower so these tests will need even more time to pass.
    50  	const nofile = 32768
    51  	rLimit := unix.Rlimit{}
    52  	err := unix.Getrlimit(unix.RLIMIT_NOFILE, &rLimit)
    53  	if err != nil {
    54  		return fmt.Errorf("failed to get RLIMIT_NOFILE: %v", err)
    55  	}
    56  	if rLimit.Cur > nofile {
    57  		rLimit.Cur = nofile
    58  		err := unix.Setrlimit(unix.RLIMIT_NOFILE, &rLimit)
    59  		if err != nil {
    60  			return fmt.Errorf("failed to set RLIMIT_NOFILE: %v", err)
    61  		}
    62  	}
    63  	return nil
    64  }
    65  
    66  func main() {
    67  	flag.Parse()
    68  
    69  	if *pause {
    70  		lib.PauseAndReap()
    71  		panic("pauseAndReap should never return")
    72  	}
    73  
    74  	if *runtime == "" {
    75  		log.Fatalf("runtime flag must be provided")
    76  	}
    77  
    78  	timer := time.NewTimer(*timeout)
    79  
    80  	tr, err := lib.TestRunnerForRuntime(*runtime)
    81  	if err != nil {
    82  		log.Fatalf("%v", err)
    83  	}
    84  
    85  	// List tests.
    86  	if *list {
    87  		tests, err := tr.ListTests()
    88  		if err != nil {
    89  			log.Fatalf("failed to list tests: %v", err)
    90  		}
    91  		for _, test := range tests {
    92  			fmt.Println(test)
    93  		}
    94  		return
    95  	}
    96  
    97  	// heartbeat
    98  	go func() {
    99  		for {
   100  			time.Sleep(15 * time.Second)
   101  			log.Println("Proctor checking in " + time.Now().String())
   102  		}
   103  	}()
   104  
   105  	var tests []string
   106  	if *testNames == "" {
   107  		// Run every test.
   108  		tests, err = tr.ListTests()
   109  		if err != nil {
   110  			log.Fatalf("failed to get all tests: %v", err)
   111  		}
   112  	} else {
   113  		// Run subset of test.
   114  		tests = strings.Split(*testNames, ",")
   115  	}
   116  
   117  	if err := setNumFilesLimit(); err != nil {
   118  		log.Fatalf("%v", err)
   119  	}
   120  
   121  	// Run tests.
   122  	cmds := tr.TestCmds(tests)
   123  	done := make(chan struct{})
   124  	defer close(done)
   125  	go func() {
   126  		select {
   127  		case <-done:
   128  			return
   129  		case <-timer.C:
   130  			log.Println("The batch timeout duration is exceeded")
   131  			killed := false
   132  			for _, cmd := range cmds {
   133  				p := cmd.Process
   134  				if p == nil || cmd.ProcessState != nil {
   135  					continue
   136  				}
   137  				pid := p.Pid
   138  				if pid > 0 {
   139  					unix.Kill(pid, unix.SIGTERM)
   140  					killed = true
   141  				}
   142  			}
   143  			if killed {
   144  				// Let tests to handle signals
   145  				time.Sleep(5 * time.Second)
   146  			}
   147  			panic("FAIL: The batch timeout duration is exceeded")
   148  		}
   149  	}()
   150  	numIterations := *runsPerTest
   151  	if numIterations == 0 {
   152  		numIterations = 1
   153  	}
   154  	for _, cmd := range cmds {
   155  		iterations := 0
   156  		successes := 0
   157  		var firstFailure error
   158  		for iteration := 1; iteration <= *runsPerTest; iteration++ {
   159  			// Make a copy of the command, as the same exec.Cmd object cannot be started multiple times.
   160  			cmdCopy := *cmd
   161  
   162  			// Handle test timeout.
   163  			testDone := make(chan struct{})
   164  			testTimedOutCh := make(chan bool, 1)
   165  			if *perTestTimeout != 0 {
   166  				go func() {
   167  					timer := time.NewTimer(*perTestTimeout)
   168  					defer timer.Stop()
   169  					select {
   170  					case <-timer.C:
   171  						testTimedOutCh <- true
   172  						cmdCopy.Process.Kill()
   173  					case <-done:
   174  						testTimedOutCh <- false
   175  					case <-testDone:
   176  						testTimedOutCh <- false
   177  					}
   178  				}()
   179  			}
   180  
   181  			// Run the test.
   182  			cmdCopy.Stdout, cmdCopy.Stderr = os.Stdout, os.Stderr
   183  			testErr := cmdCopy.Run()
   184  			close(testDone)
   185  			if <-testTimedOutCh {
   186  				testErr = fmt.Errorf("test timed out after %v", *perTestTimeout)
   187  			}
   188  
   189  			// Tally result.
   190  			iterations++
   191  			if testErr == nil {
   192  				successes++
   193  			} else if firstFailure == nil {
   194  				firstFailure = testErr
   195  			}
   196  			if *flakyShortCircuit && successes > 0 && firstFailure != nil {
   197  				break
   198  			}
   199  		}
   200  		if successes > 0 && firstFailure != nil {
   201  			// Test is flaky.
   202  			if *flakyIsError {
   203  				log.Fatalf("FLAKY: %v (%d failures out of %d)", firstFailure, iterations-successes, iterations)
   204  			} else {
   205  				log.Println(fmt.Sprintf("FLAKY: %v (%d failures out of %d)", firstFailure, iterations-successes, iterations))
   206  			}
   207  		} else if successes == 0 && firstFailure != nil {
   208  			// Test is 100% failing.
   209  			log.Fatalf("FAIL: %v", firstFailure)
   210  		} else if successes > 0 && firstFailure == nil {
   211  			// Test is 100% succeeding, do nothing.
   212  		} else {
   213  			log.Fatalf("Internal logic error")
   214  		}
   215  	}
   216  }