github.com/abayer/test-infra@v0.0.5/mungegithub/mungers/e2e/e2e.go (about)

     1  /*
     2  Copyright 2015 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package e2e
    18  
    19  import (
    20  	"encoding/xml"
    21  	"fmt"
    22  	"io"
    23  	"net/http"
    24  	"strconv"
    25  	"strings"
    26  	"sync"
    27  
    28  	"k8s.io/apimachinery/pkg/util/sets"
    29  	"k8s.io/contrib/test-utils/utils"
    30  	cache "k8s.io/test-infra/mungegithub/mungers/flakesync"
    31  	"k8s.io/test-infra/mungegithub/options"
    32  
    33  	"io/ioutil"
    34  
    35  	"github.com/golang/glog"
    36  )
    37  
// E2ETester can be queried for E2E job stability.
type E2ETester interface {
	// LoadNonBlockingStatus refreshes the recorded status of the non-blocking jobs.
	LoadNonBlockingStatus()
	// GetBuildStatus returns the recorded build status keyed by job name.
	GetBuildStatus() map[string]BuildInfo
	// Flakes returns the flakes currently known to the tester.
	Flakes() cache.Flakes
}
    44  
// BuildInfo records the outcome of one build of a job.
//
// Status is a human-readable label (this package writes values such as
// "Stable", "Not Stable" and "Ignorable flake"); ID is the build number
// rendered as a decimal string.
type BuildInfo struct {
	Status string
	ID     string
}
    50  
// RealE2ETester is the object which will get status information about recent
// jobs from a Google Cloud Storage bucket.
//
// Callers set the exported fields and then call Init, which fills in the
// unexported flake cache and resolution tracker. Init does not allocate
// BuildStatus.
type RealE2ETester struct {
	Opts                *options.Options
	NonBlockingJobNames *[]string

	sync.Mutex
	BuildStatus          map[string]BuildInfo // protected by the embedded mutex
	GoogleGCSBucketUtils *utils.Utils

	flakeCache        *cache.Cache
	resolutionTracker *ResolutionTracker
}
    64  
// HTTPHandlerInstaller is anything that can hook up HTTP requests to handlers.
// Used for installing admin functions.
type HTTPHandlerInstaller interface {
	// HandleFunc registers handler for requests matching pattern.
	HandleFunc(pattern string, handler func(http.ResponseWriter, *http.Request))
}
    70  
    71  // Init does construction-- call once it after setting the public fields of 'e'.
    72  // adminMux may be nil, in which case handlers for the resolution tracker won't
    73  // be installed.
    74  func (e *RealE2ETester) Init(adminMux HTTPHandlerInstaller) *RealE2ETester {
    75  	e.flakeCache = cache.NewCache(e.getGCSResult)
    76  	e.resolutionTracker = NewResolutionTracker()
    77  	if adminMux != nil {
    78  		adminMux.HandleFunc("/api/mark-resolved", e.resolutionTracker.SetHTTP)
    79  		adminMux.HandleFunc("/api/is-resolved", e.resolutionTracker.GetHTTP)
    80  		adminMux.HandleFunc("/api/list-resolutions", e.resolutionTracker.ListHTTP)
    81  	}
    82  	return e
    83  }
    84  
    85  func (e *RealE2ETester) locked(f func()) {
    86  	e.Lock()
    87  	defer e.Unlock()
    88  	f()
    89  }
    90  
    91  // GetBuildStatus returns the build status. This map is a copy and is thus safe
    92  // for the caller to use in any way.
    93  func (e *RealE2ETester) GetBuildStatus() map[string]BuildInfo {
    94  	e.Lock()
    95  	defer e.Unlock()
    96  	out := map[string]BuildInfo{}
    97  	for k, v := range e.BuildStatus {
    98  		out[k] = v
    99  	}
   100  	return out
   101  }
   102  
   103  // Flakes returns a sorted list of current flakes.
   104  func (e *RealE2ETester) Flakes() cache.Flakes {
   105  	return e.flakeCache.Flakes()
   106  }
   107  
   108  func (e *RealE2ETester) setBuildStatus(build, status string, id string) {
   109  	e.Lock()
   110  	defer e.Unlock()
   111  	e.BuildStatus[build] = BuildInfo{Status: status, ID: id}
   112  }
   113  
   114  const (
   115  	// ExpectedXMLHeader is the expected header of junit_XX.xml file
   116  	ExpectedXMLHeader = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
   117  )
   118  
   119  // GetBuildResult returns (or gets) the cached result of the job and build. Public.
   120  func (e *RealE2ETester) GetBuildResult(job string, number int) (*cache.Result, error) {
   121  	return e.flakeCache.Get(cache.Job(job), cache.Number(number))
   122  }
   123  
   124  func (e *RealE2ETester) getGCSResult(j cache.Job, n cache.Number) (*cache.Result, error) {
   125  	// The difference between pre- and post-submit tests is that in the
   126  	// former, we look for flakes when they pass, and in the latter, we
   127  	// look for flakes when they fail. This is because presubmit tests will
   128  	// run multiple times and pass if at least one run passed, but
   129  	// postsubmit tests run each test only once. For postsubmit tests, we
   130  	// detect flakiness by comparing between runs, but that's not possible
   131  	// for presubmit tests, because the PR author might have actually
   132  	// broken something.
   133  	if strings.Contains(string(j), "pull") {
   134  		return e.getGCSPresubmitResult(j, n)
   135  	}
   136  	return e.getGCSPostsubmitResult(j, n)
   137  }
   138  
   139  func (e *RealE2ETester) getGCSPostsubmitResult(j cache.Job, n cache.Number) (*cache.Result, error) {
   140  	stable, err := e.GoogleGCSBucketUtils.CheckFinishedStatus(string(j), int(n))
   141  	if err != nil {
   142  		glog.V(4).Infof("Error looking up job: %v, build number: %v", j, n)
   143  		// Not actually fatal!
   144  	}
   145  	r := &cache.Result{
   146  		Job:    j,
   147  		Number: n,
   148  		// TODO: StartTime:
   149  	}
   150  	if stable {
   151  		r.Status = cache.ResultStable
   152  		return r, nil
   153  	}
   154  
   155  	// This isn't stable-- see if we can find a reason.
   156  	thisFailures, err := e.failureReasons(string(j), int(n), true)
   157  	if err != nil {
   158  		glog.V(4).Infof("Error looking up job failure reasons: %v, build number: %v: %v", j, n, err)
   159  		thisFailures = nil // ensure we fall through
   160  	}
   161  	if len(thisFailures) == 0 {
   162  		r.Status = cache.ResultFailed
   163  		// Don't return any flake information, to reduce flake noise -- getting an issue opened
   164  		// for every failed run without logs is not useful.
   165  		return r, nil
   166  	}
   167  
   168  	r.Flakes = map[cache.Test]string{}
   169  	for testName, reason := range thisFailures {
   170  		r.Flakes[cache.Test(testName)] = reason
   171  	}
   172  
   173  	r.Status = cache.ResultFlaky
   174  	return r, nil
   175  }
   176  
   177  func (e *RealE2ETester) getGCSPresubmitResult(j cache.Job, n cache.Number) (*cache.Result, error) {
   178  	stable, err := e.GoogleGCSBucketUtils.CheckFinishedStatus(string(j), int(n))
   179  	if err != nil {
   180  		return nil, fmt.Errorf("error looking up job: %v, build number: %v", j, n)
   181  	}
   182  	r := &cache.Result{
   183  		Status: cache.ResultStable,
   184  		Job:    j,
   185  		Number: n,
   186  	}
   187  	if !stable {
   188  		r.Status = cache.ResultFailed
   189  		// We do *not* add a "run completely broken" flake entry since
   190  		// this is presumably the author's fault, and we don't want to
   191  		// file issues for things like that.
   192  		return r, nil
   193  	}
   194  
   195  	// Check to see if there were any individual failures (even though the
   196  	// run as a whole succeeded).
   197  	thisFailures, err := e.failureReasons(string(j), int(n), true)
   198  	if err != nil {
   199  		glog.V(2).Infof("Error looking up job failure reasons: %v, build number: %v: %v", j, n, err)
   200  		return r, nil
   201  	}
   202  	if len(thisFailures) == 0 {
   203  		glog.V(2).Infof("No flakes in %v/%v.", j, n)
   204  		return r, nil
   205  	}
   206  
   207  	r.Flakes = map[cache.Test]string{}
   208  	for testName, reason := range thisFailures {
   209  		r.Flakes[cache.Test(testName)] = reason
   210  	}
   211  
   212  	r.Status = cache.ResultFlaky
   213  	return r, nil
   214  }
   215  
   216  func (e *RealE2ETester) checkPassFail(job string, number int) (stable, ignorableFlakes bool) {
   217  	if e.resolutionTracker.Resolved(cache.Job(job), cache.Number(number)) {
   218  		e.setBuildStatus(job, "Problem Resolved", strconv.Itoa(number))
   219  		return true, true
   220  	}
   221  
   222  	thisResult, err := e.GetBuildResult(job, number)
   223  	if err != nil || thisResult.Status == cache.ResultFailed {
   224  		glog.V(4).Infof("Found unstable job: %v, build number: %v: (err: %v) %#v", job, number, err, thisResult)
   225  		e.setBuildStatus(job, "Not Stable", strconv.Itoa(number))
   226  		return false, false
   227  	}
   228  
   229  	if thisResult.Status == cache.ResultStable {
   230  		e.setBuildStatus(job, "Stable", strconv.Itoa(number))
   231  		return true, false
   232  	}
   233  
   234  	lastResult, err := e.GetBuildResult(job, number-1)
   235  	if err != nil || lastResult.Status == cache.ResultFailed {
   236  		glog.V(4).Infof("prev job doesn't help: %v, build number: %v (the previous build); (err %v) %#v", job, number-1, err, lastResult)
   237  		e.setBuildStatus(job, "Not Stable", strconv.Itoa(number))
   238  		return true, false
   239  	}
   240  
   241  	if lastResult.Status == cache.ResultStable {
   242  		e.setBuildStatus(job, "Ignorable flake", strconv.Itoa(number))
   243  		return true, true
   244  	}
   245  
   246  	intersection := sets.NewString()
   247  	for testName := range thisResult.Flakes {
   248  		if _, ok := lastResult.Flakes[testName]; ok {
   249  			intersection.Insert(string(testName))
   250  		}
   251  	}
   252  	if len(intersection) == 0 {
   253  		glog.V(2).Infof("Ignoring failure of %v/%v since it didn't happen the previous run this run = %v; prev run = %v.", job, number, thisResult.Flakes, lastResult.Flakes)
   254  		e.setBuildStatus(job, "Ignorable flake", strconv.Itoa(number))
   255  		return true, true
   256  	}
   257  	glog.V(2).Infof("Failure of %v/%v is legit. Tests that failed multiple times in a row: %v", job, number, intersection)
   258  	e.setBuildStatus(job, "Not Stable", strconv.Itoa(number))
   259  	return false, false
   260  }
   261  
   262  // LatestRunOfJob returns the number of the most recent completed run of the given job.
   263  func (e *RealE2ETester) LatestRunOfJob(jobName string) (int, error) {
   264  	return e.GoogleGCSBucketUtils.GetLastestBuildNumberFromJenkinsGoogleBucket(jobName)
   265  }
   266  
   267  // LoadNonBlockingStatus gets the build stability status for all the NonBlockingJobNames.
   268  func (e *RealE2ETester) LoadNonBlockingStatus() {
   269  	e.Opts.Lock()
   270  	jobs := *e.NonBlockingJobNames
   271  	e.Opts.Unlock()
   272  	for _, job := range jobs {
   273  		lastBuildNumber, err := e.GoogleGCSBucketUtils.GetLastestBuildNumberFromJenkinsGoogleBucket(job)
   274  		glog.V(4).Infof("Checking status of %v, %v", job, lastBuildNumber)
   275  		if err != nil {
   276  			glog.Errorf("Error while getting data for %v: %v", job, err)
   277  			e.setBuildStatus(job, "[nonblocking] Not Stable", strconv.Itoa(lastBuildNumber))
   278  			continue
   279  		}
   280  
   281  		if thisResult, err := e.GetBuildResult(job, lastBuildNumber); err != nil || thisResult.Status != cache.ResultStable {
   282  			e.setBuildStatus(job, "[nonblocking] Not Stable", strconv.Itoa(lastBuildNumber))
   283  		} else {
   284  			e.setBuildStatus(job, "[nonblocking] Stable", strconv.Itoa(lastBuildNumber))
   285  		}
   286  	}
   287  }
   288  
   289  func getJUnitFailures(r io.Reader) (failures map[string]string, err error) {
   290  	type Testcase struct {
   291  		Name      string `xml:"name,attr"`
   292  		ClassName string `xml:"classname,attr"`
   293  		Failure   string `xml:"failure"`
   294  	}
   295  	type Testsuite struct {
   296  		TestCount int        `xml:"tests,attr"`
   297  		FailCount int        `xml:"failures,attr"`
   298  		Testcases []Testcase `xml:"testcase"`
   299  	}
   300  	type Testsuites struct {
   301  		TestSuites []Testsuite `xml:"testsuite"`
   302  	}
   303  	var testSuiteList []Testsuite
   304  	failures = map[string]string{}
   305  	testSuites := &Testsuites{}
   306  	testSuite := &Testsuite{}
   307  	b, err := ioutil.ReadAll(r)
   308  	if err != nil {
   309  		return failures, err
   310  	}
   311  	// first try to parse the result with <testsuites> as top tag
   312  	err = xml.Unmarshal(b, testSuites)
   313  	if err == nil && len(testSuites.TestSuites) > 0 {
   314  		testSuiteList = testSuites.TestSuites
   315  	} else {
   316  		// second try to parse the result with <testsuite> as top tag
   317  		err = xml.Unmarshal(b, testSuite)
   318  		if err != nil {
   319  			return nil, err
   320  		}
   321  		testSuiteList = []Testsuite{*testSuite}
   322  	}
   323  	for _, ts := range testSuiteList {
   324  		for _, tc := range ts.Testcases {
   325  			if tc.Failure != "" {
   326  				failures[fmt.Sprintf("%v {%v}", tc.Name, tc.ClassName)] = tc.Failure
   327  			}
   328  		}
   329  	}
   330  	return failures, nil
   331  }
   332  
   333  // If completeList is true, collect every failure reason. Otherwise exit as soon as you see any failure.
   334  func (e *RealE2ETester) failureReasons(job string, buildNumber int, completeList bool) (failedTests map[string]string, err error) {
   335  	failuresFromResp := func(resp *http.Response) (failures map[string]string, err error) {
   336  		defer resp.Body.Close()
   337  		return getJUnitFailures(resp.Body)
   338  	}
   339  	failedTests = map[string]string{}
   340  
   341  	// junit file prefix
   342  	prefix := "artifacts/junit"
   343  	junitList, err := e.GoogleGCSBucketUtils.ListFilesInBuild(job, buildNumber, prefix)
   344  	if err != nil {
   345  		glog.Errorf("Failed to list junit files for %v/%v/%v: %v", job, buildNumber, prefix, err)
   346  	}
   347  
   348  	// If we're here it means that build failed, so we need to look for a reason
   349  	// by iterating over junit*.xml files and look for failures
   350  	for _, filePath := range junitList {
   351  		// if do not need complete list and we already have failed tests, then return
   352  		if !completeList && len(failedTests) > 0 {
   353  			break
   354  		}
   355  		if !strings.HasSuffix(filePath, ".xml") {
   356  			continue
   357  		}
   358  		split := strings.Split(filePath, "/")
   359  		junitFilePath := fmt.Sprintf("artifacts/%s", split[len(split)-1])
   360  		response, err := e.GoogleGCSBucketUtils.GetFileFromJenkinsGoogleBucket(job, buildNumber, junitFilePath)
   361  		if err != nil {
   362  			return nil, fmt.Errorf("error while getting data for %v/%v/%v: %v", job, buildNumber, junitFilePath, err)
   363  		}
   364  		if response.StatusCode != http.StatusOK {
   365  			response.Body.Close()
   366  			break
   367  		}
   368  		failures, err := failuresFromResp(response) // closes response.Body for us
   369  		if err != nil {
   370  			return nil, fmt.Errorf("failed to read the response for %v/%v/%v: %v", job, buildNumber, junitFilePath, err)
   371  		}
   372  		for k, v := range failures {
   373  			failedTests[k] = v
   374  		}
   375  	}
   376  
   377  	return failedTests, nil
   378  }