github.com/shashidharatd/test-infra@v0.0.0-20171006011030-71304e1ca560/robots/issue-creator/sources/triage-filer.go (about)

     1  /*
     2  Copyright 2017 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package sources
    18  
    19  import (
    20  	"bytes"
    21  	"encoding/json"
    22  	"flag"
    23  	"fmt"
    24  	"reflect"
    25  	"sort"
    26  	"strconv"
    27  	"strings"
    28  	"time"
    29  
    30  	githubapi "github.com/google/go-github/github"
    31  	"k8s.io/test-infra/mungegithub/mungers/mungerutil"
    32  	"k8s.io/test-infra/robots/issue-creator/creator"
    33  )
    34  
const (
	// timeFormat is the reference-time layout used to render build timestamps
	// in issue bodies.
	timeFormat = "2 Jan 2006 15:04 MST"

	// Configuration constants.
	// topJobsCount and topTestsCount cap the number of rows shown in the
	// per-cluster "top failed" tables in the issue body.
	topJobsCount  = 3
	topTestsCount = 3
	// triageURL is the dashboard linked from issue bodies.
	triageURL = "https://go.k8s.io/triage"
	// clusterDataURL is where the raw failure-cluster JSON is fetched from.
	clusterDataURL = "https://storage.googleapis.com/k8s-gubernator/triage/failure_data.json"
)
    44  
// TriageFiler files issues for clustered test failures.
type TriageFiler struct {
	topClustersCount int // number of clusters to sync issues for (flag "triage-count").
	windowDays       int // sliding time window size in days (flag "triage-window").

	// nextSync is never written in this file — possibly vestigial; confirm
	// against the rest of the package before relying on it.
	nextSync time.Time
	// latestStart is the most recent build start time (unix seconds) seen in
	// the triage data; set by filterAndValidate.
	latestStart int64

	creator *creator.IssueCreator // used for owner/SIG lookups when building issue bodies.
	data    *triageData           // parsed triage JSON, populated by loadClusters.
}
    56  
// init registers the TriageFiler as an issue source with the IssueCreator
// under the name "triage-filer"; registration failure is fatal.
func init() {
	creator.RegisterSourceOrDie("triage-filer", &TriageFiler{})
}
    60  
    61  // FileIssues is the main work function of the TriageFiler.  It fetches and parses cluster data,
    62  // then syncs the top issues to github with the IssueCreator.
    63  func (f *TriageFiler) Issues(c *creator.IssueCreator) ([]creator.Issue, error) {
    64  	f.creator = c
    65  	rawjson, err := mungerutil.ReadHTTP(clusterDataURL)
    66  	if err != nil {
    67  		return nil, err
    68  	}
    69  	clusters, err := f.loadClusters(rawjson)
    70  	if err != nil {
    71  		return nil, err
    72  	}
    73  	topclusters := topClusters(clusters, f.topClustersCount)
    74  	issues := make([]creator.Issue, 0, len(topclusters))
    75  	for _, clust := range topclusters {
    76  		issues = append(issues, clust)
    77  	}
    78  	return issues, nil
    79  }
    80  
    81  // RegisterOptions registers options for this munger; returns any that require a restart when changed.
    82  func (f *TriageFiler) RegisterFlags() {
    83  	flag.IntVar(&f.topClustersCount, "triage-count", 3, "The number of clusters to sync issues for on github.")
    84  	flag.IntVar(&f.windowDays, "triage-window", 1, "The size of the sliding time window (in days) that is used to determine which failures to consider.")
    85  }
    86  
// triageData is a struct that represents the format of the JSON triage data and is used for parsing.
type triageData struct {
	Builds struct {
		// Cols holds parallel per-build arrays; a single build's values all
		// live at the same row index across these slices.
		Cols struct {
			Elapsed     []int    `json:"elapsed"`
			Executor    []string `json:"executor"`
			PR          []string `json:"pr"`
			Result      []string `json:"result"`
			Started     []int64  `json:"started"`
			TestsFailed []int    `json:"tests_failed"`
			TestsRun    []int    `json:"tests_run"`
		} `json:"cols"`
		// JobsRaw maps job name to the raw buildnum->row mapping, which in the
		// JSON is either a 3-element array (contiguous rows) or a dictionary;
		// parseTriageData converts each entry into a BuildIndexer in Jobs.
		JobsRaw  map[string]interface{} `json:"jobs"` // []int or map[string]int
		Jobs     map[string]BuildIndexer
		JobPaths map[string]string `json:"job_paths"` // job name -> storage path (gs://...).
	} `json:"builds"`
	Clustered []*Cluster `json:"clustered"`
}
   105  
// Cluster is one failure cluster parsed from the triage JSON, plus aggregate
// fields populated by loadClusters. It is the type appended as a
// creator.Issue by Issues (see Title, Body, ID, Labels, Owners, Priority).
type Cluster struct {
	Id    string  `json:"id"`
	Key   string  `json:"key"`
	Text  string  `json:"text"`
	Tests []*Test `json:"tests"`

	filer *TriageFiler // back-pointer to the filer that parsed this cluster.
	// jobs maps job name -> deduplicated failing build numbers, aggregated
	// across all tests in the cluster.
	jobs        map[string][]int
	totalBuilds int
	totalJobs   int
	totalTests  int
}
   118  
// Test is one failing test within a cluster, along with the jobs it failed in.
type Test struct {
	Name string `json:"name"`
	Jobs []*Job `json:"jobs"`
}
   123  
// Job is one failing job along with the build numbers that failed.
type Job struct {
	Name   string `json:"name"`
	Builds []int  `json:"builds"`
}
   128  
   129  // filterAndValidate removes failure data that falls outside the time window and ensures that cluster
   130  // data is well formed. It also removes data for PR jobs so that only post-submit failures are considered.
   131  func (f *TriageFiler) filterAndValidate(windowDays int) error {
   132  	f.latestStart = int64(0)
   133  	for _, start := range f.data.Builds.Cols.Started {
   134  		if start > f.latestStart {
   135  			f.latestStart = start
   136  		}
   137  	}
   138  	cutoffTime := time.Unix(f.latestStart, 0).AddDate(0, 0, -windowDays).Unix()
   139  
   140  	validClusts := []*Cluster{}
   141  	for clustIndex, clust := range f.data.Clustered {
   142  		if len(clust.Id) == 0 {
   143  			return fmt.Errorf("the cluster at index %d in the triage JSON data does not specify an Id.", clustIndex)
   144  		}
   145  		if clust.Tests == nil {
   146  			return fmt.Errorf("cluster '%s' does not have a 'tests' key.", clust.Id)
   147  		}
   148  		validTests := []*Test{}
   149  		for _, test := range clust.Tests {
   150  			if len(test.Name) == 0 {
   151  				return fmt.Errorf("cluster '%s' contains a test without a name.", clust.Id)
   152  			}
   153  			if test.Jobs == nil {
   154  				return fmt.Errorf("cluster '%s' does not have a 'jobs' key.", clust.Id)
   155  			}
   156  			validJobs := []*Job{}
   157  			for _, job := range test.Jobs {
   158  				if len(job.Name) == 0 {
   159  					return fmt.Errorf("cluster '%s' contains a job without a name under test '%s'.", clust.Id, test.Name)
   160  				}
   161  				// Filter out PR jobs
   162  				if strings.HasPrefix(job.Name, "pr:") {
   163  					continue
   164  				}
   165  				if len(job.Builds) == 0 {
   166  					return fmt.Errorf("cluster '%s' contains job '%s' under test '%s' with no failing builds.", clust.Id, job.Name, test.Name)
   167  				}
   168  				validBuilds := []int{}
   169  				rowMap, ok := f.data.Builds.Jobs[job.Name]
   170  				if !ok {
   171  					return fmt.Errorf("triage json data does not contain buildnum to row index mapping for job '%s'.", job.Name)
   172  				}
   173  				for _, buildnum := range job.Builds {
   174  					row, err := rowMap.rowForBuild(buildnum)
   175  					if err != nil {
   176  						return err
   177  					}
   178  					if f.data.Builds.Cols.Started[row] > cutoffTime {
   179  						validBuilds = append(validBuilds, buildnum)
   180  					}
   181  				}
   182  				if len(validBuilds) > 0 {
   183  					job.Builds = validBuilds
   184  					validJobs = append(validJobs, job)
   185  				}
   186  			}
   187  			if len(validJobs) > 0 {
   188  				test.Jobs = validJobs
   189  				validTests = append(validTests, test)
   190  			}
   191  		}
   192  		if len(validTests) > 0 {
   193  			clust.Tests = validTests
   194  			validClusts = append(validClusts, clust)
   195  		}
   196  	}
   197  	f.data.Clustered = validClusts
   198  	return nil
   199  }
   200  
// BuildIndexer is an interface that describes the buildnum to row index mapping used to retrieve data
// about individual builds from the JSON file.
// This is an interface because the JSON format describing failure clusters has 2 ways of recording the mapping info.
type BuildIndexer interface {
	// rowForBuild returns the row index (into the Builds.Cols arrays) for the
	// given build number, or an error if the build is not in the mapping.
	rowForBuild(buildnum int) (int, error)
}
   207  
   208  // ContigIndexer is a BuildIndexer implementation for when the buildnum to row index mapping describes
   209  // a contiguous set of rows via 3 ints.
   210  type ContigIndexer struct {
   211  	startRow, startBuild, count int
   212  }
   213  
   214  func (rowMap ContigIndexer) rowForBuild(buildnum int) (int, error) {
   215  	if buildnum < rowMap.startBuild || buildnum > rowMap.startBuild+rowMap.count-1 {
   216  		return 0, fmt.Errorf("failed to find row in JSON for buildnumber: %d. Row mapping or buildnumber is invalid.", buildnum)
   217  	}
   218  	return buildnum - rowMap.startBuild + rowMap.startRow, nil
   219  }
   220  
   221  // DictIndexer is a BuildIndexer implementation for when the buildnum to row index mapping is simply a dictionary.
   222  // The value type of this dictionary is interface instead of int so that we don't have to convert the original map.
   223  type DictIndexer map[string]interface{}
   224  
   225  func (rowMap DictIndexer) rowForBuild(buildnum int) (int, error) {
   226  	row, ok := rowMap[strconv.Itoa(buildnum)]
   227  	if !ok {
   228  		return 0, fmt.Errorf("failed to find row in JSON for buildnumber: %d. Row mapping or buildnumber is invalid.", buildnum)
   229  	}
   230  	var irow float64
   231  	if irow, ok = row.(float64); !ok {
   232  		return 0, fmt.Errorf("failed to find row in JSON for buildnumber: %d. Row mapping contains invalid type.", buildnum)
   233  	}
   234  	return int(irow), nil
   235  }
   236  
   237  // loadClusters parses and filters the json data, then populates every Cluster struct with
   238  // aggregated job data and totals. The job data specifies all jobs that failed in a cluster and the
   239  // builds that failed for each job, independent of which tests the jobs or builds failed.
   240  func (f *TriageFiler) loadClusters(jsonIn []byte) ([]*Cluster, error) {
   241  	var err error
   242  	f.data, err = parseTriageData(jsonIn)
   243  	if err != nil {
   244  		return nil, err
   245  	}
   246  	if err = f.filterAndValidate(f.windowDays); err != nil {
   247  		return nil, err
   248  	}
   249  
   250  	// Aggregate failing builds in each cluster by job (independent of tests).
   251  	for _, clust := range f.data.Clustered {
   252  		clust.filer = f
   253  		clust.jobs = make(map[string][]int)
   254  
   255  		for _, test := range clust.Tests {
   256  			for _, job := range test.Jobs {
   257  				for _, buildnum := range job.Builds {
   258  					found := false
   259  					for _, oldBuild := range clust.jobs[job.Name] {
   260  						if oldBuild == buildnum {
   261  							found = true
   262  							break
   263  						}
   264  					}
   265  					if !found {
   266  						clust.jobs[job.Name] = append(clust.jobs[job.Name], buildnum)
   267  					}
   268  				}
   269  			}
   270  		}
   271  		clust.totalJobs = len(clust.jobs)
   272  		clust.totalTests = len(clust.Tests)
   273  		clust.totalBuilds = 0
   274  		for _, builds := range clust.jobs {
   275  			clust.totalBuilds += len(builds)
   276  		}
   277  	}
   278  	return f.data.Clustered, nil
   279  }
   280  
   281  // parseTriageData unmarshals raw json data into a triageData struct and creates a BuildIndexer for
   282  // every job.
   283  func parseTriageData(jsonIn []byte) (*triageData, error) {
   284  	var data triageData
   285  	if err := json.Unmarshal(jsonIn, &data); err != nil {
   286  		return nil, err
   287  	}
   288  
   289  	if data.Builds.Cols.Started == nil {
   290  		return nil, fmt.Errorf("triage data json is missing the builds.cols.started key.")
   291  	}
   292  	if data.Builds.JobsRaw == nil {
   293  		return nil, fmt.Errorf("triage data is missing the builds.jobs key.")
   294  	}
   295  	if data.Builds.JobPaths == nil {
   296  		return nil, fmt.Errorf("triage data is missing the builds.job_paths key.")
   297  	}
   298  	if data.Clustered == nil {
   299  		return nil, fmt.Errorf("triage data is missing the clustered key.")
   300  	}
   301  	// Populate 'Jobs' with the BuildIndexer for each job.
   302  	data.Builds.Jobs = make(map[string]BuildIndexer)
   303  	for jobID, mapper := range data.Builds.JobsRaw {
   304  		switch mapper := mapper.(type) {
   305  		case []interface{}:
   306  			// In this case mapper is a 3 member array. 0:first buildnum, 1:number of builds, 2:start index.
   307  			data.Builds.Jobs[jobID] = ContigIndexer{
   308  				startBuild: int(mapper[0].(float64)),
   309  				count:      int(mapper[1].(float64)),
   310  				startRow:   int(mapper[2].(float64)),
   311  			}
   312  		case map[string]interface{}:
   313  			// In this case mapper is a dictionary.
   314  			data.Builds.Jobs[jobID] = DictIndexer(mapper)
   315  		default:
   316  			return nil, fmt.Errorf("the build number to row index mapping for job '%s' is not an accepted type. Type is: %v", jobID, reflect.TypeOf(mapper))
   317  		}
   318  	}
   319  	return &data, nil
   320  }
   321  
   322  // topClusters gets the 'count' most important clusters from a slice of clusters based on number of build failures.
   323  func topClusters(clusters []*Cluster, count int) []*Cluster {
   324  	less := func(i, j int) bool { return clusters[i].totalBuilds > clusters[j].totalBuilds }
   325  	sort.SliceStable(clusters, less)
   326  
   327  	if len(clusters) < count {
   328  		count = len(clusters)
   329  	}
   330  	return clusters[0:count]
   331  }
   332  
   333  // topTestsFailing returns the top 'count' test names sorted by number of failing jobs.
   334  func (c *Cluster) topTestsFailed(count int) []*Test {
   335  	less := func(i, j int) bool { return len(c.Tests[i].Jobs) > len(c.Tests[j].Jobs) }
   336  	sort.SliceStable(c.Tests, less)
   337  
   338  	if len(c.Tests) < count {
   339  		count = len(c.Tests)
   340  	}
   341  	return c.Tests[0:count]
   342  }
   343  
   344  // topJobsFailed returns the top 'count' job names sorted by number of failing builds.
   345  func (c *Cluster) topJobsFailed(count int) []*Job {
   346  	slice := make([]*Job, len(c.jobs))
   347  	i := 0
   348  	for jobName, builds := range c.jobs {
   349  		slice[i] = &Job{Name: jobName, Builds: builds}
   350  		i++
   351  	}
   352  	less := func(i, j int) bool { return len(slice[i].Builds) > len(slice[j].Builds) }
   353  	sort.SliceStable(slice, less)
   354  
   355  	if len(slice) < count {
   356  		count = len(slice)
   357  	}
   358  	return slice[0:count]
   359  }
   360  
   361  // Title is the string to use as the github issue title.
   362  func (c *Cluster) Title() string {
   363  	return fmt.Sprintf("Failure cluster [%s...] failed %d builds, %d jobs, and %d tests over %d days",
   364  		c.Id[0:6],
   365  		c.totalBuilds,
   366  		c.totalJobs,
   367  		c.totalTests,
   368  		c.filer.windowDays,
   369  	)
   370  }
   371  
// Body returns the body text of the github issue and *must* contain the output of ID().
// closedIssues is a (potentially empty) slice containing all closed issues authored by this bot
// that contain ID() in their body.
// If Body returns an empty string no issue is created.
func (c *Cluster) Body(closedIssues []*githubapi.Issue) string {
	// First check that the most recently closed issue (if any exist) was closed
	// before the start of the sliding window. If any was closed inside the
	// window, suppress re-filing by returning "".
	cutoffTime := time.Unix(c.filer.latestStart, 0).AddDate(0, 0, -c.filer.windowDays)
	for _, closed := range closedIssues {
		// NOTE(review): go-github Issue fields are pointers; this assumes
		// ClosedAt is non-nil for closed issues — confirm against the caller.
		if closed.ClosedAt.After(cutoffTime) {
			return ""
		}
	}

	var buf bytes.Buffer
	// Header links both the raw ID (required by ID()) and the triage dashboard.
	fmt.Fprintf(&buf, "### Failure cluster [%s](%s#%s)\n", c.ID(), triageURL, c.Id)
	fmt.Fprintf(&buf, "##### Error text:\n```\n%s\n```\n", c.Text)
	// cluster stats
	fmt.Fprint(&buf, "##### Failure cluster statistics:\n")
	fmt.Fprintf(&buf, "%d tests failed,    %d jobs failed,    %d builds failed.\n", c.totalTests, c.totalJobs, c.totalBuilds)
	fmt.Fprintf(&buf, "Failure stats cover %d day time range '%s' to '%s'.\n##### Top failed tests by jobs failed:\n",
		c.filer.windowDays,
		cutoffTime.Format(timeFormat),
		time.Unix(c.filer.latestStart, 0).Format(timeFormat))
	// top tests failed (markdown table)
	fmt.Fprint(&buf, "\n| Test Name | Jobs Failed |\n| --- | --- |\n")
	for _, test := range c.topTestsFailed(topTestsCount) {
		fmt.Fprintf(&buf, "| %s | %d |\n", test.Name, len(test.Jobs))
	}
	// top jobs failed (markdown table with a link to the most recent failure)
	fmt.Fprint(&buf, "\n##### Top failed jobs by builds failed:\n")
	fmt.Fprint(&buf, "\n| Job Name | Builds Failed | Latest Failure |\n| --- | --- | --- |\n")
	for _, job := range c.topJobsFailed(topJobsCount) {
		// Find the build with the latest start time for this job.
		latest := 0
		latestTime := int64(0)
		rowMap := c.filer.data.Builds.Jobs[job.Name]
		for _, build := range job.Builds {
			row, _ := rowMap.rowForBuild(build) // Already validated start time lookup for all builds.
			buildTime := c.filer.data.Builds.Cols.Started[row]
			if buildTime > latestTime {
				latestTime = buildTime
				latest = build
			}
		}
		// JobPaths values are gs:// URLs; gubernator links want the bare path.
		path := strings.TrimPrefix(c.filer.data.Builds.JobPaths[job.Name], "gs://")
		fmt.Fprintf(&buf, "| %s | %d | [%s](https://k8s-gubernator.appspot.com/build/%s/%d) |\n", job.Name, len(job.Builds), time.Unix(latestTime, 0).Format(timeFormat), path, latest)
	}
	// previously closed issues if there are any
	if len(closedIssues) > 0 {
		fmt.Fprint(&buf, "\n##### Previously closed issues for this cluster:\n")
		for _, closed := range closedIssues {
			fmt.Fprintf(&buf, "#%d ", *closed.Number)
		}
		fmt.Fprint(&buf, "\n")
	}

	// Create /assign command. All tests (not just the top few) are used for
	// owner lookup.
	testNames := make([]string, 0, len(c.Tests))
	for _, test := range c.topTestsFailed(len(c.Tests)) {
		testNames = append(testNames, test.Name)
	}
	ownersMap := c.filer.creator.TestsOwners(testNames)
	if len(ownersMap) > 0 {
		fmt.Fprint(&buf, "\n/assign")
		for user := range ownersMap {
			fmt.Fprintf(&buf, " @%s", user)
		}
		fmt.Fprint(&buf, "\n")
	}

	// Explanations of assignees and sigs
	fmt.Fprint(&buf, c.filer.creator.ExplainTestAssignments(testNames))

	fmt.Fprintf(&buf, "\n[Current Status](%s#%s)", triageURL, c.Id)

	return buf.String()
}
   449  
// ID yields the string identifier that uniquely identifies this issue.
// This ID must appear in the body of the issue.
// DO NOT CHANGE how this ID is formatted or duplicate issues may be created on github.
func (c *Cluster) ID() string {
	// The raw cluster ID from the triage JSON is used verbatim as the issue identity.
	return c.Id
}
   456  
   457  // Labels returns the labels to apply to the issue created for this cluster on github.
   458  func (c *Cluster) Labels() []string {
   459  	labels := []string{"kind/flake"}
   460  
   461  	topTests := make([]string, len(c.Tests))
   462  	for i, test := range c.topTestsFailed(len(c.Tests)) {
   463  		topTests[i] = test.Name
   464  	}
   465  	for sig := range c.filer.creator.TestsSIGs(topTests) {
   466  		labels = append(labels, "sig/"+sig)
   467  	}
   468  
   469  	return labels
   470  }
   471  
   472  // Owners returns the list of usernames to assign to this issue on github.
   473  func (c *Cluster) Owners() []string {
   474  	// Assign owners by including a /assign command in the body instead of using Owners to set
   475  	// assignees on the issue request. This lets prow do the assignee validation and will mention
   476  	// the user we want to assign even if they can't be assigned.
   477  	return nil
   478  }
   479  
   480  // Priority calculates and returns the priority of this issue.
   481  // The returned bool indicates if the returned priority is valid and can be used.
   482  func (c *Cluster) Priority() (string, bool) {
   483  	// TODO implement priority calcs later.
   484  	return "", false
   485  }