// github.com/abayer/test-infra@v0.0.5/mungegithub/mungers/e2e/e2e.go (about)

/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package e2e

import (
	"encoding/xml"
	"fmt"
	"io"
	"net/http"
	"strconv"
	"strings"
	"sync"

	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/contrib/test-utils/utils"
	cache "k8s.io/test-infra/mungegithub/mungers/flakesync"
	"k8s.io/test-infra/mungegithub/options"

	"io/ioutil"

	"github.com/golang/glog"
)

// E2ETester can be queried for E2E job stability.
type E2ETester interface {
	// LoadNonBlockingStatus refreshes the recorded status of the
	// non-blocking jobs.
	LoadNonBlockingStatus()

	// GetBuildStatus returns the recorded build status, keyed by job name.
	GetBuildStatus() map[string]BuildInfo

	// Flakes returns the flakes currently known to the tester.
	Flakes() cache.Flakes
}

// BuildInfo tells the build ID and the build success.
type BuildInfo struct {
	// Status is a human-readable status string such as "Stable" or
	// "Not Stable".
	Status string

	// ID is the build number rendered as a decimal string.
	ID string
}

// RealE2ETester is an E2ETester that gets information about recent jobs from
// a Google Cloud Storage bucket. Set the exported fields, then call Init once
// before using it.
type RealE2ETester struct {
	// Opts is locked while NonBlockingJobNames is read.
	Opts *options.Options

	// NonBlockingJobNames points at the list of jobs examined by
	// LoadNonBlockingStatus.
	NonBlockingJobNames *[]string

	// The embedded mutex guards BuildStatus.
	sync.Mutex

	// BuildStatus maps a job name to its most recently recorded status.
	// Protected by the embedded mutex. Init does not allocate this map.
	BuildStatus map[string]BuildInfo

	// GoogleGCSBucketUtils performs the bucket lookups (latest build number,
	// finished status, junit artifacts).
	GoogleGCSBucketUtils *utils.Utils

	// flakeCache and resolutionTracker are created by Init.
	flakeCache *cache.Cache

	resolutionTracker *ResolutionTracker
}

// HTTPHandlerInstaller is anything that can hook up HTTP requests to handlers.
// Used for installing admin functions.
67 type HTTPHandlerInstaller interface { 68 HandleFunc(pattern string, handler func(http.ResponseWriter, *http.Request)) 69 } 70 71 // Init does construction-- call once it after setting the public fields of 'e'. 72 // adminMux may be nil, in which case handlers for the resolution tracker won't 73 // be installed. 74 func (e *RealE2ETester) Init(adminMux HTTPHandlerInstaller) *RealE2ETester { 75 e.flakeCache = cache.NewCache(e.getGCSResult) 76 e.resolutionTracker = NewResolutionTracker() 77 if adminMux != nil { 78 adminMux.HandleFunc("/api/mark-resolved", e.resolutionTracker.SetHTTP) 79 adminMux.HandleFunc("/api/is-resolved", e.resolutionTracker.GetHTTP) 80 adminMux.HandleFunc("/api/list-resolutions", e.resolutionTracker.ListHTTP) 81 } 82 return e 83 } 84 85 func (e *RealE2ETester) locked(f func()) { 86 e.Lock() 87 defer e.Unlock() 88 f() 89 } 90 91 // GetBuildStatus returns the build status. This map is a copy and is thus safe 92 // for the caller to use in any way. 93 func (e *RealE2ETester) GetBuildStatus() map[string]BuildInfo { 94 e.Lock() 95 defer e.Unlock() 96 out := map[string]BuildInfo{} 97 for k, v := range e.BuildStatus { 98 out[k] = v 99 } 100 return out 101 } 102 103 // Flakes returns a sorted list of current flakes. 104 func (e *RealE2ETester) Flakes() cache.Flakes { 105 return e.flakeCache.Flakes() 106 } 107 108 func (e *RealE2ETester) setBuildStatus(build, status string, id string) { 109 e.Lock() 110 defer e.Unlock() 111 e.BuildStatus[build] = BuildInfo{Status: status, ID: id} 112 } 113 114 const ( 115 // ExpectedXMLHeader is the expected header of junit_XX.xml file 116 ExpectedXMLHeader = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" 117 ) 118 119 // GetBuildResult returns (or gets) the cached result of the job and build. Public. 
120 func (e *RealE2ETester) GetBuildResult(job string, number int) (*cache.Result, error) { 121 return e.flakeCache.Get(cache.Job(job), cache.Number(number)) 122 } 123 124 func (e *RealE2ETester) getGCSResult(j cache.Job, n cache.Number) (*cache.Result, error) { 125 // The difference between pre- and post-submit tests is that in the 126 // former, we look for flakes when they pass, and in the latter, we 127 // look for flakes when they fail. This is because presubmit tests will 128 // run multiple times and pass if at least one run passed, but 129 // postsubmit tests run each test only once. For postsubmit tests, we 130 // detect flakiness by comparing between runs, but that's not possible 131 // for presubmit tests, because the PR author might have actually 132 // broken something. 133 if strings.Contains(string(j), "pull") { 134 return e.getGCSPresubmitResult(j, n) 135 } 136 return e.getGCSPostsubmitResult(j, n) 137 } 138 139 func (e *RealE2ETester) getGCSPostsubmitResult(j cache.Job, n cache.Number) (*cache.Result, error) { 140 stable, err := e.GoogleGCSBucketUtils.CheckFinishedStatus(string(j), int(n)) 141 if err != nil { 142 glog.V(4).Infof("Error looking up job: %v, build number: %v", j, n) 143 // Not actually fatal! 144 } 145 r := &cache.Result{ 146 Job: j, 147 Number: n, 148 // TODO: StartTime: 149 } 150 if stable { 151 r.Status = cache.ResultStable 152 return r, nil 153 } 154 155 // This isn't stable-- see if we can find a reason. 156 thisFailures, err := e.failureReasons(string(j), int(n), true) 157 if err != nil { 158 glog.V(4).Infof("Error looking up job failure reasons: %v, build number: %v: %v", j, n, err) 159 thisFailures = nil // ensure we fall through 160 } 161 if len(thisFailures) == 0 { 162 r.Status = cache.ResultFailed 163 // Don't return any flake information, to reduce flake noise -- getting an issue opened 164 // for every failed run without logs is not useful. 
165 return r, nil 166 } 167 168 r.Flakes = map[cache.Test]string{} 169 for testName, reason := range thisFailures { 170 r.Flakes[cache.Test(testName)] = reason 171 } 172 173 r.Status = cache.ResultFlaky 174 return r, nil 175 } 176 177 func (e *RealE2ETester) getGCSPresubmitResult(j cache.Job, n cache.Number) (*cache.Result, error) { 178 stable, err := e.GoogleGCSBucketUtils.CheckFinishedStatus(string(j), int(n)) 179 if err != nil { 180 return nil, fmt.Errorf("error looking up job: %v, build number: %v", j, n) 181 } 182 r := &cache.Result{ 183 Status: cache.ResultStable, 184 Job: j, 185 Number: n, 186 } 187 if !stable { 188 r.Status = cache.ResultFailed 189 // We do *not* add a "run completely broken" flake entry since 190 // this is presumably the author's fault, and we don't want to 191 // file issues for things like that. 192 return r, nil 193 } 194 195 // Check to see if there were any individual failures (even though the 196 // run as a whole succeeded). 197 thisFailures, err := e.failureReasons(string(j), int(n), true) 198 if err != nil { 199 glog.V(2).Infof("Error looking up job failure reasons: %v, build number: %v: %v", j, n, err) 200 return r, nil 201 } 202 if len(thisFailures) == 0 { 203 glog.V(2).Infof("No flakes in %v/%v.", j, n) 204 return r, nil 205 } 206 207 r.Flakes = map[cache.Test]string{} 208 for testName, reason := range thisFailures { 209 r.Flakes[cache.Test(testName)] = reason 210 } 211 212 r.Status = cache.ResultFlaky 213 return r, nil 214 } 215 216 func (e *RealE2ETester) checkPassFail(job string, number int) (stable, ignorableFlakes bool) { 217 if e.resolutionTracker.Resolved(cache.Job(job), cache.Number(number)) { 218 e.setBuildStatus(job, "Problem Resolved", strconv.Itoa(number)) 219 return true, true 220 } 221 222 thisResult, err := e.GetBuildResult(job, number) 223 if err != nil || thisResult.Status == cache.ResultFailed { 224 glog.V(4).Infof("Found unstable job: %v, build number: %v: (err: %v) %#v", job, number, err, thisResult) 225 
e.setBuildStatus(job, "Not Stable", strconv.Itoa(number)) 226 return false, false 227 } 228 229 if thisResult.Status == cache.ResultStable { 230 e.setBuildStatus(job, "Stable", strconv.Itoa(number)) 231 return true, false 232 } 233 234 lastResult, err := e.GetBuildResult(job, number-1) 235 if err != nil || lastResult.Status == cache.ResultFailed { 236 glog.V(4).Infof("prev job doesn't help: %v, build number: %v (the previous build); (err %v) %#v", job, number-1, err, lastResult) 237 e.setBuildStatus(job, "Not Stable", strconv.Itoa(number)) 238 return true, false 239 } 240 241 if lastResult.Status == cache.ResultStable { 242 e.setBuildStatus(job, "Ignorable flake", strconv.Itoa(number)) 243 return true, true 244 } 245 246 intersection := sets.NewString() 247 for testName := range thisResult.Flakes { 248 if _, ok := lastResult.Flakes[testName]; ok { 249 intersection.Insert(string(testName)) 250 } 251 } 252 if len(intersection) == 0 { 253 glog.V(2).Infof("Ignoring failure of %v/%v since it didn't happen the previous run this run = %v; prev run = %v.", job, number, thisResult.Flakes, lastResult.Flakes) 254 e.setBuildStatus(job, "Ignorable flake", strconv.Itoa(number)) 255 return true, true 256 } 257 glog.V(2).Infof("Failure of %v/%v is legit. Tests that failed multiple times in a row: %v", job, number, intersection) 258 e.setBuildStatus(job, "Not Stable", strconv.Itoa(number)) 259 return false, false 260 } 261 262 // LatestRunOfJob returns the number of the most recent completed run of the given job. 263 func (e *RealE2ETester) LatestRunOfJob(jobName string) (int, error) { 264 return e.GoogleGCSBucketUtils.GetLastestBuildNumberFromJenkinsGoogleBucket(jobName) 265 } 266 267 // LoadNonBlockingStatus gets the build stability status for all the NonBlockingJobNames. 
268 func (e *RealE2ETester) LoadNonBlockingStatus() { 269 e.Opts.Lock() 270 jobs := *e.NonBlockingJobNames 271 e.Opts.Unlock() 272 for _, job := range jobs { 273 lastBuildNumber, err := e.GoogleGCSBucketUtils.GetLastestBuildNumberFromJenkinsGoogleBucket(job) 274 glog.V(4).Infof("Checking status of %v, %v", job, lastBuildNumber) 275 if err != nil { 276 glog.Errorf("Error while getting data for %v: %v", job, err) 277 e.setBuildStatus(job, "[nonblocking] Not Stable", strconv.Itoa(lastBuildNumber)) 278 continue 279 } 280 281 if thisResult, err := e.GetBuildResult(job, lastBuildNumber); err != nil || thisResult.Status != cache.ResultStable { 282 e.setBuildStatus(job, "[nonblocking] Not Stable", strconv.Itoa(lastBuildNumber)) 283 } else { 284 e.setBuildStatus(job, "[nonblocking] Stable", strconv.Itoa(lastBuildNumber)) 285 } 286 } 287 } 288 289 func getJUnitFailures(r io.Reader) (failures map[string]string, err error) { 290 type Testcase struct { 291 Name string `xml:"name,attr"` 292 ClassName string `xml:"classname,attr"` 293 Failure string `xml:"failure"` 294 } 295 type Testsuite struct { 296 TestCount int `xml:"tests,attr"` 297 FailCount int `xml:"failures,attr"` 298 Testcases []Testcase `xml:"testcase"` 299 } 300 type Testsuites struct { 301 TestSuites []Testsuite `xml:"testsuite"` 302 } 303 var testSuiteList []Testsuite 304 failures = map[string]string{} 305 testSuites := &Testsuites{} 306 testSuite := &Testsuite{} 307 b, err := ioutil.ReadAll(r) 308 if err != nil { 309 return failures, err 310 } 311 // first try to parse the result with <testsuites> as top tag 312 err = xml.Unmarshal(b, testSuites) 313 if err == nil && len(testSuites.TestSuites) > 0 { 314 testSuiteList = testSuites.TestSuites 315 } else { 316 // second try to parse the result with <testsuite> as top tag 317 err = xml.Unmarshal(b, testSuite) 318 if err != nil { 319 return nil, err 320 } 321 testSuiteList = []Testsuite{*testSuite} 322 } 323 for _, ts := range testSuiteList { 324 for _, tc := range 
ts.Testcases { 325 if tc.Failure != "" { 326 failures[fmt.Sprintf("%v {%v}", tc.Name, tc.ClassName)] = tc.Failure 327 } 328 } 329 } 330 return failures, nil 331 } 332 333 // If completeList is true, collect every failure reason. Otherwise exit as soon as you see any failure. 334 func (e *RealE2ETester) failureReasons(job string, buildNumber int, completeList bool) (failedTests map[string]string, err error) { 335 failuresFromResp := func(resp *http.Response) (failures map[string]string, err error) { 336 defer resp.Body.Close() 337 return getJUnitFailures(resp.Body) 338 } 339 failedTests = map[string]string{} 340 341 // junit file prefix 342 prefix := "artifacts/junit" 343 junitList, err := e.GoogleGCSBucketUtils.ListFilesInBuild(job, buildNumber, prefix) 344 if err != nil { 345 glog.Errorf("Failed to list junit files for %v/%v/%v: %v", job, buildNumber, prefix, err) 346 } 347 348 // If we're here it means that build failed, so we need to look for a reason 349 // by iterating over junit*.xml files and look for failures 350 for _, filePath := range junitList { 351 // if do not need complete list and we already have failed tests, then return 352 if !completeList && len(failedTests) > 0 { 353 break 354 } 355 if !strings.HasSuffix(filePath, ".xml") { 356 continue 357 } 358 split := strings.Split(filePath, "/") 359 junitFilePath := fmt.Sprintf("artifacts/%s", split[len(split)-1]) 360 response, err := e.GoogleGCSBucketUtils.GetFileFromJenkinsGoogleBucket(job, buildNumber, junitFilePath) 361 if err != nil { 362 return nil, fmt.Errorf("error while getting data for %v/%v/%v: %v", job, buildNumber, junitFilePath, err) 363 } 364 if response.StatusCode != http.StatusOK { 365 response.Body.Close() 366 break 367 } 368 failures, err := failuresFromResp(response) // closes response.Body for us 369 if err != nil { 370 return nil, fmt.Errorf("failed to read the response for %v/%v/%v: %v", job, buildNumber, junitFilePath, err) 371 } 372 for k, v := range failures { 373 failedTests[k] = 
v 374 } 375 } 376 377 return failedTests, nil 378 }