github.com/google/cadvisor@v0.49.1/integration/runner/runner.go

github.com/google/cadvisor@v0.49.1/integration/runner/runner.go (about)

     1  // Copyright 2015 Google Inc. All Rights Reserved.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package main
    16  
    17  import (
    18  	"bufio"
    19  	"bytes"
    20  	"encoding/json"
    21  	"errors"
    22  	"flag"
    23  	"fmt"
    24  	"io"
    25  	"net/http"
    26  	"os"
    27  	"os/exec"
    28  	"path"
    29  	"regexp"
    30  	"strconv"
    31  	"strings"
    32  	"sync"
    33  	"time"
    34  
    35  	"k8s.io/klog/v2"
    36  
    37  	cadvisorApi "github.com/google/cadvisor/info/v2"
    38  )
    39  
// The machine running this program must be able to ssh into every host without a password.
// Usage: go run ./integration/runner/runner.go --logtostderr --v 2 --ssh-options "-F <.ssh/config file>" <list of hosts>
    42  
    43  const (
    44  	cadvisorBinary = "cadvisor"
    45  	testTimeout    = 15 * time.Minute
    46  )
    47  
    48  var cadvisorTimeout = flag.Duration("cadvisor_timeout", 15*time.Second, "Time to wait for cAdvisor to come up on the remote host")
    49  var port = flag.Int("port", 8080, "Port in which to start cAdvisor in the remote host")
    50  var testRetryCount = flag.Int("test-retry-count", 3, "Number of times to retry failed tests before failing.")
    51  var testRetryWhitelist = flag.String("test-retry-whitelist", "", "Path to newline separated list of regexexp for test failures that should be retried.  If empty, no tests are retried.")
    52  var sshOptions = flag.String("ssh-options", "", "Commandline options passed to ssh.")
    53  var retryRegex *regexp.Regexp
    54  
    55  func getAttributes(ipAddress, portStr string) (*cadvisorApi.Attributes, error) {
    56  	// Get host attributes and log attributes if the tests fail.
    57  	var attributes cadvisorApi.Attributes
    58  	resp, err := http.Get(fmt.Sprintf("http://%s:%s/api/v2.1/attributes", ipAddress, portStr))
    59  	if err != nil {
    60  		return nil, fmt.Errorf("failed to get attributes - %v", err)
    61  	}
    62  	if resp.StatusCode != http.StatusOK {
    63  		return nil, fmt.Errorf("failed to get attributes. Status code - %v", resp.StatusCode)
    64  	}
    65  	defer resp.Body.Close()
    66  	body, err := io.ReadAll(resp.Body)
    67  	if err != nil {
    68  		return nil, fmt.Errorf("unable to read attributes response body - %v", err)
    69  	}
    70  	if err := json.Unmarshal(body, &attributes); err != nil {
    71  		return nil, fmt.Errorf("failed to unmarshal attributes - %v", err)
    72  	}
    73  	return &attributes, nil
    74  }
    75  
    76  func RunCommand(cmd string, args ...string) error {
    77  	output, err := exec.Command(cmd, args...).CombinedOutput()
    78  	if err != nil {
    79  		return fmt.Errorf("command %q %q failed with error: %v and output: %s", cmd, args, err, output)
    80  	}
    81  
    82  	return nil
    83  }
    84  
    85  func RunSshCommand(cmd string, args ...string) error {
    86  	if *sshOptions != "" {
    87  		args = append(strings.Split(*sshOptions, " "), args...)
    88  	}
    89  	return RunCommand(cmd, args...)
    90  }
    91  
    92  func PushAndRunTests(host, testDir string) (result error) {
    93  	// Push binary.
    94  	klog.Infof("Pushing cAdvisor binary to %q...", host)
    95  
    96  	err := RunSshCommand("ssh", host, "--", "mkdir", "-p", testDir)
    97  	if err != nil {
    98  		return fmt.Errorf("failed to make remote testing directory: %v", err)
    99  	}
   100  	defer func() {
   101  		err = RunSshCommand("ssh", host, "--", "rm", "-rf", testDir)
   102  		if err != nil {
   103  			klog.Errorf("Failed to cleanup test directory: %v", err)
   104  		}
   105  	}()
   106  
   107  	err = RunSshCommand("scp", "-r", cadvisorBinary, fmt.Sprintf("%s:%s", host, testDir))
   108  	if err != nil {
   109  		return fmt.Errorf("failed to copy binary: %v", err)
   110  	}
   111  
   112  	// Start cAdvisor.
   113  	klog.Infof("Running cAdvisor on %q...", host)
   114  	portStr := strconv.Itoa(*port)
   115  	errChan := make(chan error, 1)
   116  	go func() {
   117  		err = RunSshCommand("ssh", host, "--", fmt.Sprintf("sudo GORACE='halt_on_error=1' %s --port %s --logtostderr --env_metadata_whitelist=TEST_VAR  &> %s/log.txt", path.Join(testDir, cadvisorBinary), portStr, testDir))
   118  		if err != nil {
   119  			errChan <- fmt.Errorf("error running cAdvisor: %v", err)
   120  		}
   121  	}()
   122  	defer func() {
   123  		err = RunSshCommand("ssh", host, "--", "sudo", "pkill", cadvisorBinary)
   124  		if err != nil {
   125  			klog.Errorf("Failed to cleanup: %v", err)
   126  		}
   127  	}()
   128  	defer func() {
   129  		if result != nil {
   130  			// Copy logs from the host
   131  			err := RunSshCommand("scp", fmt.Sprintf("%s:%s/log.txt", host, testDir), "./")
   132  			if err != nil {
   133  				result = fmt.Errorf("error fetching logs: %v for %v", err, result)
   134  				return
   135  			}
   136  			defer os.Remove("./log.txt")
   137  			logs, err := os.ReadFile("./log.txt")
   138  			if err != nil {
   139  				result = fmt.Errorf("error reading local log file: %v for %v", err, result)
   140  				return
   141  			}
   142  			klog.Errorf("----------------------\nLogs from Host: %q\n%v\n", host, string(logs))
   143  
   144  			// Get attributes for debugging purposes.
   145  			attributes, err := getAttributes(host, portStr)
   146  			if err != nil {
   147  				klog.Errorf("Failed to read host attributes: %v", err)
   148  			}
   149  			result = fmt.Errorf("error on host %s: %v\n%+v", host, result, attributes)
   150  		}
   151  	}()
   152  
   153  	// Wait for cAdvisor to come up.
   154  	endTime := time.Now().Add(*cadvisorTimeout)
   155  	done := false
   156  	for endTime.After(time.Now()) && !done {
   157  		select {
   158  		case err := <-errChan:
   159  			// Quit early if there was an error.
   160  			return err
   161  		case <-time.After(500 * time.Millisecond):
   162  			// Stop waiting when cAdvisor is healthy..
   163  			resp, err := http.Get(fmt.Sprintf("http://%s:%s/healthz", host, portStr))
   164  			if err == nil && resp.StatusCode == http.StatusOK {
   165  				done = true
   166  			}
   167  		}
   168  	}
   169  	if !done {
   170  		return fmt.Errorf("timed out waiting for cAdvisor to come up at host %q", host)
   171  	}
   172  
   173  	// Run the tests in a retry loop.
   174  	klog.Infof("Running integration tests targeting %q...", host)
   175  	for i := 0; i <= *testRetryCount; i++ {
   176  		// Check if this is a retry
   177  		if i > 0 {
   178  			time.Sleep(time.Second * 15) // Wait 15 seconds before retrying
   179  			klog.Warningf("Retrying (%d of %d) tests on host %s due to error %v", i, *testRetryCount, host, err)
   180  		}
   181  		// Run the command
   182  
   183  		err = RunCommand("go", "test", "--timeout", testTimeout.String(), "github.com/google/cadvisor/integration/tests/...", "--host", host, "--port", portStr, "--ssh-options", *sshOptions)
   184  		if err == nil {
   185  			// On success, break out of retry loop
   186  			break
   187  		}
   188  
   189  		// Only retry on test failures caused by these known flaky failure conditions
   190  		if retryRegex == nil || !retryRegex.Match([]byte(err.Error())) {
   191  			klog.Warningf("Skipping retry for tests on host %s because error is not whitelisted", host)
   192  			break
   193  		}
   194  	}
   195  	return err
   196  }
   197  
   198  func Run() error {
   199  	start := time.Now()
   200  	defer func() {
   201  		klog.Infof("Execution time %v", time.Since(start))
   202  	}()
   203  	defer klog.Flush()
   204  
   205  	hosts := flag.Args()
   206  	testDir := fmt.Sprintf("/tmp/cadvisor-%d", os.Getpid())
   207  	klog.Infof("Running integration tests on host(s) %q", strings.Join(hosts, ","))
   208  
   209  	// Build cAdvisor.
   210  	klog.Infof("Building cAdvisor...")
   211  	err := RunCommand("build/build.sh")
   212  	if err != nil {
   213  		return err
   214  	}
   215  	defer func() {
   216  		err := RunCommand("rm", cadvisorBinary)
   217  		if err != nil {
   218  			klog.Error(err)
   219  		}
   220  	}()
   221  
   222  	// Run test on all hosts in parallel.
   223  	var wg sync.WaitGroup
   224  	allErrors := make([]error, 0)
   225  	var allErrorsLock sync.Mutex
   226  	for _, host := range hosts {
   227  		wg.Add(1)
   228  		go func(host string) {
   229  			defer wg.Done()
   230  			err := PushAndRunTests(host, testDir)
   231  			if err != nil {
   232  				func() {
   233  					allErrorsLock.Lock()
   234  					defer allErrorsLock.Unlock()
   235  					allErrors = append(allErrors, err)
   236  				}()
   237  			}
   238  		}(host)
   239  	}
   240  	wg.Wait()
   241  
   242  	if len(allErrors) != 0 {
   243  		var buffer bytes.Buffer
   244  		for i, err := range allErrors {
   245  			buffer.WriteString(fmt.Sprintf("Error %d: ", i))
   246  			buffer.WriteString(err.Error())
   247  			buffer.WriteString("\n")
   248  		}
   249  		return errors.New(buffer.String())
   250  	}
   251  
   252  	klog.Infof("All tests pass!")
   253  	return nil
   254  }
   255  
   256  // initRetryWhitelist initializes the whitelist of test failures that can be retried.
   257  func initRetryWhitelist() {
   258  	if *testRetryWhitelist == "" {
   259  		return
   260  	}
   261  
   262  	file, err := os.Open(*testRetryWhitelist)
   263  	if err != nil {
   264  		klog.Fatal(err)
   265  	}
   266  	defer file.Close()
   267  
   268  	retryStrings := []string{}
   269  	scanner := bufio.NewScanner(file)
   270  	for scanner.Scan() {
   271  		text := scanner.Text()
   272  		if text != "" {
   273  			retryStrings = append(retryStrings, text)
   274  		}
   275  	}
   276  	if err := scanner.Err(); err != nil {
   277  		klog.Fatal(err)
   278  	}
   279  	retryRegex = regexp.MustCompile(strings.Join(retryStrings, "|"))
   280  }
   281  
   282  func main() {
   283  	klog.InitFlags(nil)
   284  	flag.Parse()
   285  
   286  	// Check usage.
   287  	if len(flag.Args()) == 0 {
   288  		klog.Fatalf("USAGE: runner <hosts to test>")
   289  	}
   290  	initRetryWhitelist()
   291  
   292  	// Run the tests.
   293  	err := Run()
   294  	if err != nil {
   295  		klog.Fatal(err)
   296  	}
   297  }