github.com/google/cadvisor@v0.49.1/integration/runner/runner.go (about) 1 // Copyright 2015 Google Inc. All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package main 16 17 import ( 18 "bufio" 19 "bytes" 20 "encoding/json" 21 "errors" 22 "flag" 23 "fmt" 24 "io" 25 "net/http" 26 "os" 27 "os/exec" 28 "path" 29 "regexp" 30 "strconv" 31 "strings" 32 "sync" 33 "time" 34 35 "k8s.io/klog/v2" 36 37 cadvisorApi "github.com/google/cadvisor/info/v2" 38 ) 39 40 // must be able to ssh into hosts without password 41 // go run ./integration/runner/runner.go --logtostderr --v 2 --ssh-config <.ssh/config file> <list of hosts> 42 43 const ( 44 cadvisorBinary = "cadvisor" 45 testTimeout = 15 * time.Minute 46 ) 47 48 var cadvisorTimeout = flag.Duration("cadvisor_timeout", 15*time.Second, "Time to wait for cAdvisor to come up on the remote host") 49 var port = flag.Int("port", 8080, "Port in which to start cAdvisor in the remote host") 50 var testRetryCount = flag.Int("test-retry-count", 3, "Number of times to retry failed tests before failing.") 51 var testRetryWhitelist = flag.String("test-retry-whitelist", "", "Path to newline separated list of regexexp for test failures that should be retried. If empty, no tests are retried.") 52 var sshOptions = flag.String("ssh-options", "", "Commandline options passed to ssh.") 53 var retryRegex *regexp.Regexp 54 55 func getAttributes(ipAddress, portStr string) (*cadvisorApi.Attributes, error) { 56 // Get host attributes and log attributes if the tests fail. 57 var attributes cadvisorApi.Attributes 58 resp, err := http.Get(fmt.Sprintf("http://%s:%s/api/v2.1/attributes", ipAddress, portStr)) 59 if err != nil { 60 return nil, fmt.Errorf("failed to get attributes - %v", err) 61 } 62 if resp.StatusCode != http.StatusOK { 63 return nil, fmt.Errorf("failed to get attributes. Status code - %v", resp.StatusCode) 64 } 65 defer resp.Body.Close() 66 body, err := io.ReadAll(resp.Body) 67 if err != nil { 68 return nil, fmt.Errorf("unable to read attributes response body - %v", err) 69 } 70 if err := json.Unmarshal(body, &attributes); err != nil { 71 return nil, fmt.Errorf("failed to unmarshal attributes - %v", err) 72 } 73 return &attributes, nil 74 } 75 76 func RunCommand(cmd string, args ...string) error { 77 output, err := exec.Command(cmd, args...).CombinedOutput() 78 if err != nil { 79 return fmt.Errorf("command %q %q failed with error: %v and output: %s", cmd, args, err, output) 80 } 81 82 return nil 83 } 84 85 func RunSshCommand(cmd string, args ...string) error { 86 if *sshOptions != "" { 87 args = append(strings.Split(*sshOptions, " "), args...) 88 } 89 return RunCommand(cmd, args...) 90 } 91 92 func PushAndRunTests(host, testDir string) (result error) { 93 // Push binary. 94 klog.Infof("Pushing cAdvisor binary to %q...", host) 95 96 err := RunSshCommand("ssh", host, "--", "mkdir", "-p", testDir) 97 if err != nil { 98 return fmt.Errorf("failed to make remote testing directory: %v", err) 99 } 100 defer func() { 101 err = RunSshCommand("ssh", host, "--", "rm", "-rf", testDir) 102 if err != nil { 103 klog.Errorf("Failed to cleanup test directory: %v", err) 104 } 105 }() 106 107 err = RunSshCommand("scp", "-r", cadvisorBinary, fmt.Sprintf("%s:%s", host, testDir)) 108 if err != nil { 109 return fmt.Errorf("failed to copy binary: %v", err) 110 } 111 112 // Start cAdvisor. 113 klog.Infof("Running cAdvisor on %q...", host) 114 portStr := strconv.Itoa(*port) 115 errChan := make(chan error, 1) 116 go func() { 117 err = RunSshCommand("ssh", host, "--", fmt.Sprintf("sudo GORACE='halt_on_error=1' %s --port %s --logtostderr --env_metadata_whitelist=TEST_VAR &> %s/log.txt", path.Join(testDir, cadvisorBinary), portStr, testDir)) 118 if err != nil { 119 errChan <- fmt.Errorf("error running cAdvisor: %v", err) 120 } 121 }() 122 defer func() { 123 err = RunSshCommand("ssh", host, "--", "sudo", "pkill", cadvisorBinary) 124 if err != nil { 125 klog.Errorf("Failed to cleanup: %v", err) 126 } 127 }() 128 defer func() { 129 if result != nil { 130 // Copy logs from the host 131 err := RunSshCommand("scp", fmt.Sprintf("%s:%s/log.txt", host, testDir), "./") 132 if err != nil { 133 result = fmt.Errorf("error fetching logs: %v for %v", err, result) 134 return 135 } 136 defer os.Remove("./log.txt") 137 logs, err := os.ReadFile("./log.txt") 138 if err != nil { 139 result = fmt.Errorf("error reading local log file: %v for %v", err, result) 140 return 141 } 142 klog.Errorf("----------------------\nLogs from Host: %q\n%v\n", host, string(logs)) 143 144 // Get attributes for debugging purposes. 145 attributes, err := getAttributes(host, portStr) 146 if err != nil { 147 klog.Errorf("Failed to read host attributes: %v", err) 148 } 149 result = fmt.Errorf("error on host %s: %v\n%+v", host, result, attributes) 150 } 151 }() 152 153 // Wait for cAdvisor to come up. 154 endTime := time.Now().Add(*cadvisorTimeout) 155 done := false 156 for endTime.After(time.Now()) && !done { 157 select { 158 case err := <-errChan: 159 // Quit early if there was an error. 160 return err 161 case <-time.After(500 * time.Millisecond): 162 // Stop waiting when cAdvisor is healthy.. 163 resp, err := http.Get(fmt.Sprintf("http://%s:%s/healthz", host, portStr)) 164 if err == nil && resp.StatusCode == http.StatusOK { 165 done = true 166 } 167 } 168 } 169 if !done { 170 return fmt.Errorf("timed out waiting for cAdvisor to come up at host %q", host) 171 } 172 173 // Run the tests in a retry loop. 174 klog.Infof("Running integration tests targeting %q...", host) 175 for i := 0; i <= *testRetryCount; i++ { 176 // Check if this is a retry 177 if i > 0 { 178 time.Sleep(time.Second * 15) // Wait 15 seconds before retrying 179 klog.Warningf("Retrying (%d of %d) tests on host %s due to error %v", i, *testRetryCount, host, err) 180 } 181 // Run the command 182 183 err = RunCommand("go", "test", "--timeout", testTimeout.String(), "github.com/google/cadvisor/integration/tests/...", "--host", host, "--port", portStr, "--ssh-options", *sshOptions) 184 if err == nil { 185 // On success, break out of retry loop 186 break 187 } 188 189 // Only retry on test failures caused by these known flaky failure conditions 190 if retryRegex == nil || !retryRegex.Match([]byte(err.Error())) { 191 klog.Warningf("Skipping retry for tests on host %s because error is not whitelisted", host) 192 break 193 } 194 } 195 return err 196 } 197 198 func Run() error { 199 start := time.Now() 200 defer func() { 201 klog.Infof("Execution time %v", time.Since(start)) 202 }() 203 defer klog.Flush() 204 205 hosts := flag.Args() 206 testDir := fmt.Sprintf("/tmp/cadvisor-%d", os.Getpid()) 207 klog.Infof("Running integration tests on host(s) %q", strings.Join(hosts, ",")) 208 209 // Build cAdvisor. 210 klog.Infof("Building cAdvisor...") 211 err := RunCommand("build/build.sh") 212 if err != nil { 213 return err 214 } 215 defer func() { 216 err := RunCommand("rm", cadvisorBinary) 217 if err != nil { 218 klog.Error(err) 219 } 220 }() 221 222 // Run test on all hosts in parallel. 223 var wg sync.WaitGroup 224 allErrors := make([]error, 0) 225 var allErrorsLock sync.Mutex 226 for _, host := range hosts { 227 wg.Add(1) 228 go func(host string) { 229 defer wg.Done() 230 err := PushAndRunTests(host, testDir) 231 if err != nil { 232 func() { 233 allErrorsLock.Lock() 234 defer allErrorsLock.Unlock() 235 allErrors = append(allErrors, err) 236 }() 237 } 238 }(host) 239 } 240 wg.Wait() 241 242 if len(allErrors) != 0 { 243 var buffer bytes.Buffer 244 for i, err := range allErrors { 245 buffer.WriteString(fmt.Sprintf("Error %d: ", i)) 246 buffer.WriteString(err.Error()) 247 buffer.WriteString("\n") 248 } 249 return errors.New(buffer.String()) 250 } 251 252 klog.Infof("All tests pass!") 253 return nil 254 } 255 256 // initRetryWhitelist initializes the whitelist of test failures that can be retried. 257 func initRetryWhitelist() { 258 if *testRetryWhitelist == "" { 259 return 260 } 261 262 file, err := os.Open(*testRetryWhitelist) 263 if err != nil { 264 klog.Fatal(err) 265 } 266 defer file.Close() 267 268 retryStrings := []string{} 269 scanner := bufio.NewScanner(file) 270 for scanner.Scan() { 271 text := scanner.Text() 272 if text != "" { 273 retryStrings = append(retryStrings, text) 274 } 275 } 276 if err := scanner.Err(); err != nil { 277 klog.Fatal(err) 278 } 279 retryRegex = regexp.MustCompile(strings.Join(retryStrings, "|")) 280 } 281 282 func main() { 283 klog.InitFlags(nil) 284 flag.Parse() 285 286 // Check usage. 287 if len(flag.Args()) == 0 { 288 klog.Fatalf("USAGE: runner <hosts to test>") 289 } 290 initRetryWhitelist() 291 292 // Run the tests. 293 err := Run() 294 if err != nil { 295 klog.Fatal(err) 296 } 297 }