github.com/munnerz/test-infra@v0.0.0-20190108210205-ce3d181dc989/robots/issue-creator/sources/triage-filer.go (about)

     1  /*
     2  Copyright 2017 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package sources
    18  
    19  import (
    20  	"bytes"
    21  	"encoding/json"
    22  	"flag"
    23  	"fmt"
    24  	"reflect"
    25  	"sort"
    26  	"strconv"
    27  	"strings"
    28  	"time"
    29  
    30  	githubapi "github.com/google/go-github/github"
    31  	"k8s.io/test-infra/robots/issue-creator/creator"
    32  )
    33  
const (
	// timeFormat is the reference-time layout used for all timestamps rendered
	// into issue bodies.
	timeFormat = "2 Jan 2006 15:04 MST"

	// Configuration constants.
	topJobsCount   = 3                                                                        // number of failing jobs listed per issue
	topTestsCount  = 3                                                                        // number of failing tests listed per issue
	triageURL      = "https://go.k8s.io/triage"                                               // human-facing triage dashboard linked from issues
	clusterDataURL = "https://storage.googleapis.com/k8s-gubernator/triage/failure_data.json" // source of the clustered failure JSON
)
    43  
// TriageFiler files issues for clustered test failures.
type TriageFiler struct {
	topClustersCount int // max clusters to sync issues for (flag: triage-count)
	windowDays       int // sliding time window size in days (flag: triage-window)

	// nextSync is not referenced anywhere in this file; presumably used by
	// other code — TODO(review): confirm it is still needed.
	nextSync    time.Time
	latestStart int64 // newest build start time seen in the data (unix seconds)

	creator *creator.IssueCreator // the IssueCreator driving this source
	data    *triageData           // parsed (and later filtered) triage JSON
}
    55  
// init registers this source with the IssueCreator framework under the name
// "triage-filer" so it can be selected at runtime. Per its name,
// RegisterSourceOrDie presumably aborts the program on registration failure.
func init() {
	creator.RegisterSourceOrDie("triage-filer", &TriageFiler{})
}
    59  
    60  // Issues is the main work function of the TriageFiler.  It fetches and parses cluster data,
    61  // then syncs the top issues to github with the IssueCreator.
    62  func (f *TriageFiler) Issues(c *creator.IssueCreator) ([]creator.Issue, error) {
    63  	f.creator = c
    64  	rawjson, err := ReadHTTP(clusterDataURL)
    65  	if err != nil {
    66  		return nil, err
    67  	}
    68  	clusters, err := f.loadClusters(rawjson)
    69  	if err != nil {
    70  		return nil, err
    71  	}
    72  	topclusters := topClusters(clusters, f.topClustersCount)
    73  	issues := make([]creator.Issue, 0, len(topclusters))
    74  	for _, clust := range topclusters {
    75  		issues = append(issues, clust)
    76  	}
    77  	return issues, nil
    78  }
    79  
    80  // RegisterFlags registers options for this munger; returns any that require a restart when changed.
    81  func (f *TriageFiler) RegisterFlags() {
    82  	flag.IntVar(&f.topClustersCount, "triage-count", 3, "The number of clusters to sync issues for on github.")
    83  	flag.IntVar(&f.windowDays, "triage-window", 1, "The size of the sliding time window (in days) that is used to determine which failures to consider.")
    84  }
    85  
// triageData is a struct that represents the format of the JSON triage data and is used for parsing.
type triageData struct {
	Builds struct {
		// Cols holds column-oriented per-build data; each slice is indexed by
		// the build's row number.
		Cols struct {
			Elapsed     []int    `json:"elapsed"`
			Executor    []string `json:"executor"`
			PR          []string `json:"pr"`
			Result      []string `json:"result"`
			Started     []int64  `json:"started"` // build start times (unix seconds)
			TestsFailed []int    `json:"tests_failed"`
			TestsRun    []int    `json:"tests_run"`
		} `json:"cols"`
		// JobsRaw is the raw buildnum->row mapping per job as unmarshaled from
		// JSON; parseTriageData converts it into the typed Jobs map below.
		JobsRaw  map[string]interface{} `json:"jobs"` // []int or map[string]int
		Jobs     map[string]BuildIndexer
		JobPaths map[string]string `json:"job_paths"`
	} `json:"builds"`
	Clustered []*Cluster `json:"clustered"`
}
   104  
// Cluster holds information about a failure cluster.
type Cluster struct {
	Identifier string  `json:"id"`
	Key        string  `json:"key"`
	Text       string  `json:"text"` // representative error text for the cluster
	Tests      []*Test `json:"tests"`

	// The fields below are not part of the JSON; they are populated by
	// TriageFiler.loadClusters after parsing and filtering.
	filer       *TriageFiler
	jobs        map[string][]int // job name -> deduplicated failing build numbers
	totalBuilds int
	totalJobs   int
	totalTests  int
}
   118  
// Test holds a name and list of jobs
type Test struct {
	Name string `json:"name"` // failing test name
	Jobs []*Job `json:"jobs"` // jobs in which this test failed
}
   124  
// Job holds a name and list of build numbers
type Job struct {
	Name   string `json:"name"`   // job name (PR jobs are prefixed "pr:")
	Builds []int  `json:"builds"` // failing build numbers for this job
}
   130  
// filterAndValidate removes failure data that falls outside the time window and ensures that cluster
// data is well formed. It also removes data for PR jobs so that only post-submit failures are considered.
//
// It mutates f.data in place (trimming the Clustered, Tests, Jobs, and Builds
// slices) and records the newest build start time in f.latestStart.
// Returns an error if any cluster, test, or job entry is malformed.
func (f *TriageFiler) filterAndValidate(windowDays int) error {
	// Anchor the window to the newest build start time in the data rather than
	// wall-clock "now", so a stale data file still yields a full window.
	f.latestStart = int64(0)
	for _, start := range f.data.Builds.Cols.Started {
		if start > f.latestStart {
			f.latestStart = start
		}
	}
	cutoffTime := time.Unix(f.latestStart, 0).AddDate(0, 0, -windowDays).Unix()

	validClusts := []*Cluster{}
	for clustIndex, clust := range f.data.Clustered {
		if len(clust.Identifier) == 0 {
			return fmt.Errorf("the cluster at index %d in the triage JSON data does not specify an ID", clustIndex)
		}
		if clust.Tests == nil {
			return fmt.Errorf("cluster '%s' does not have a 'tests' key", clust.Identifier)
		}
		validTests := []*Test{}
		for _, test := range clust.Tests {
			if len(test.Name) == 0 {
				return fmt.Errorf("cluster '%s' contains a test without a name", clust.Identifier)
			}
			if test.Jobs == nil {
				return fmt.Errorf("cluster '%s' does not have a 'jobs' key", clust.Identifier)
			}
			validJobs := []*Job{}
			for _, job := range test.Jobs {
				if len(job.Name) == 0 {
					return fmt.Errorf("cluster '%s' contains a job without a name under test '%s'", clust.Identifier, test.Name)
				}
				// Filter out PR jobs so only post-submit failures remain.
				if strings.HasPrefix(job.Name, "pr:") {
					continue
				}
				if len(job.Builds) == 0 {
					return fmt.Errorf("cluster '%s' contains job '%s' under test '%s' with no failing builds", clust.Identifier, job.Name, test.Name)
				}
				// Keep only builds whose start time is inside the window; the
				// row index mapping gives each build's row in Cols.Started.
				validBuilds := []int{}
				rowMap, ok := f.data.Builds.Jobs[job.Name]
				if !ok {
					return fmt.Errorf("triage json data does not contain buildnum to row index mapping for job '%s'", job.Name)
				}
				for _, buildnum := range job.Builds {
					row, err := rowMap.rowForBuild(buildnum)
					if err != nil {
						return err
					}
					if f.data.Builds.Cols.Started[row] > cutoffTime {
						validBuilds = append(validBuilds, buildnum)
					}
				}
				// Jobs/tests/clusters left empty by the window filter are
				// dropped entirely rather than kept as empty shells.
				if len(validBuilds) > 0 {
					job.Builds = validBuilds
					validJobs = append(validJobs, job)
				}
			}
			if len(validJobs) > 0 {
				test.Jobs = validJobs
				validTests = append(validTests, test)
			}
		}
		if len(validTests) > 0 {
			clust.Tests = validTests
			validClusts = append(validClusts, clust)
		}
	}
	f.data.Clustered = validClusts
	return nil
}
   202  
// BuildIndexer is an interface that describes the buildnum to row index mapping used to retrieve data
// about individual builds from the JSON file.
// This is an interface because the JSON format describing failure clusters has 2 ways of recording the mapping info.
type BuildIndexer interface {
	// rowForBuild returns the row index for buildnum, or an error if buildnum
	// is not present in the mapping.
	rowForBuild(buildnum int) (int, error)
}
   209  
   210  // ContigIndexer is a BuildIndexer implementation for when the buildnum to row index mapping describes
   211  // a contiguous set of rows via 3 ints.
   212  type ContigIndexer struct {
   213  	startRow, startBuild, count int
   214  }
   215  
   216  func (rowMap ContigIndexer) rowForBuild(buildnum int) (int, error) {
   217  	if buildnum < rowMap.startBuild || buildnum > rowMap.startBuild+rowMap.count-1 {
   218  		return 0, fmt.Errorf("failed to find row in JSON for buildnumber: %d. Row mapping or buildnumber is invalid", buildnum)
   219  	}
   220  	return buildnum - rowMap.startBuild + rowMap.startRow, nil
   221  }
   222  
   223  // DictIndexer is a BuildIndexer implementation for when the buildnum to row index mapping is simply a dictionary.
   224  // The value type of this dictionary is interface instead of int so that we don't have to convert the original map.
   225  type DictIndexer map[string]interface{}
   226  
   227  func (rowMap DictIndexer) rowForBuild(buildnum int) (int, error) {
   228  	row, ok := rowMap[strconv.Itoa(buildnum)]
   229  	if !ok {
   230  		return 0, fmt.Errorf("failed to find row in JSON for buildnumber: %d. Row mapping or buildnumber is invalid", buildnum)
   231  	}
   232  	var irow float64
   233  	if irow, ok = row.(float64); !ok {
   234  		return 0, fmt.Errorf("failed to find row in JSON for buildnumber: %d. Row mapping contains invalid type", buildnum)
   235  	}
   236  	return int(irow), nil
   237  }
   238  
   239  // loadClusters parses and filters the json data, then populates every Cluster struct with
   240  // aggregated job data and totals. The job data specifies all jobs that failed in a cluster and the
   241  // builds that failed for each job, independent of which tests the jobs or builds failed.
   242  func (f *TriageFiler) loadClusters(jsonIn []byte) ([]*Cluster, error) {
   243  	var err error
   244  	f.data, err = parseTriageData(jsonIn)
   245  	if err != nil {
   246  		return nil, err
   247  	}
   248  	if err = f.filterAndValidate(f.windowDays); err != nil {
   249  		return nil, err
   250  	}
   251  
   252  	// Aggregate failing builds in each cluster by job (independent of tests).
   253  	for _, clust := range f.data.Clustered {
   254  		clust.filer = f
   255  		clust.jobs = make(map[string][]int)
   256  
   257  		for _, test := range clust.Tests {
   258  			for _, job := range test.Jobs {
   259  				for _, buildnum := range job.Builds {
   260  					found := false
   261  					for _, oldBuild := range clust.jobs[job.Name] {
   262  						if oldBuild == buildnum {
   263  							found = true
   264  							break
   265  						}
   266  					}
   267  					if !found {
   268  						clust.jobs[job.Name] = append(clust.jobs[job.Name], buildnum)
   269  					}
   270  				}
   271  			}
   272  		}
   273  		clust.totalJobs = len(clust.jobs)
   274  		clust.totalTests = len(clust.Tests)
   275  		clust.totalBuilds = 0
   276  		for _, builds := range clust.jobs {
   277  			clust.totalBuilds += len(builds)
   278  		}
   279  	}
   280  	return f.data.Clustered, nil
   281  }
   282  
   283  // parseTriageData unmarshals raw json data into a triageData struct and creates a BuildIndexer for
   284  // every job.
   285  func parseTriageData(jsonIn []byte) (*triageData, error) {
   286  	var data triageData
   287  	if err := json.Unmarshal(jsonIn, &data); err != nil {
   288  		return nil, err
   289  	}
   290  
   291  	if data.Builds.Cols.Started == nil {
   292  		return nil, fmt.Errorf("triage data json is missing the builds.cols.started key")
   293  	}
   294  	if data.Builds.JobsRaw == nil {
   295  		return nil, fmt.Errorf("triage data is missing the builds.jobs key")
   296  	}
   297  	if data.Builds.JobPaths == nil {
   298  		return nil, fmt.Errorf("triage data is missing the builds.job_paths key")
   299  	}
   300  	if data.Clustered == nil {
   301  		return nil, fmt.Errorf("triage data is missing the clustered key")
   302  	}
   303  	// Populate 'Jobs' with the BuildIndexer for each job.
   304  	data.Builds.Jobs = make(map[string]BuildIndexer)
   305  	for jobID, mapper := range data.Builds.JobsRaw {
   306  		switch mapper := mapper.(type) {
   307  		case []interface{}:
   308  			// In this case mapper is a 3 member array. 0:first buildnum, 1:number of builds, 2:start index.
   309  			data.Builds.Jobs[jobID] = ContigIndexer{
   310  				startBuild: int(mapper[0].(float64)),
   311  				count:      int(mapper[1].(float64)),
   312  				startRow:   int(mapper[2].(float64)),
   313  			}
   314  		case map[string]interface{}:
   315  			// In this case mapper is a dictionary.
   316  			data.Builds.Jobs[jobID] = DictIndexer(mapper)
   317  		default:
   318  			return nil, fmt.Errorf("the build number to row index mapping for job '%s' is not an accepted type. Type is: %v", jobID, reflect.TypeOf(mapper))
   319  		}
   320  	}
   321  	return &data, nil
   322  }
   323  
   324  // topClusters gets the 'count' most important clusters from a slice of clusters based on number of build failures.
   325  func topClusters(clusters []*Cluster, count int) []*Cluster {
   326  	less := func(i, j int) bool { return clusters[i].totalBuilds > clusters[j].totalBuilds }
   327  	sort.SliceStable(clusters, less)
   328  
   329  	if len(clusters) < count {
   330  		count = len(clusters)
   331  	}
   332  	return clusters[0:count]
   333  }
   334  
   335  // topTestsFailing returns the top 'count' test names sorted by number of failing jobs.
   336  func (c *Cluster) topTestsFailed(count int) []*Test {
   337  	less := func(i, j int) bool { return len(c.Tests[i].Jobs) > len(c.Tests[j].Jobs) }
   338  	sort.SliceStable(c.Tests, less)
   339  
   340  	if len(c.Tests) < count {
   341  		count = len(c.Tests)
   342  	}
   343  	return c.Tests[0:count]
   344  }
   345  
   346  // topJobsFailed returns the top 'count' job names sorted by number of failing builds.
   347  func (c *Cluster) topJobsFailed(count int) []*Job {
   348  	slice := make([]*Job, len(c.jobs))
   349  	i := 0
   350  	for jobName, builds := range c.jobs {
   351  		slice[i] = &Job{Name: jobName, Builds: builds}
   352  		i++
   353  	}
   354  	less := func(i, j int) bool { return len(slice[i].Builds) > len(slice[j].Builds) }
   355  	sort.SliceStable(slice, less)
   356  
   357  	if len(slice) < count {
   358  		count = len(slice)
   359  	}
   360  	return slice[0:count]
   361  }
   362  
   363  // Title is the string to use as the github issue title.
   364  func (c *Cluster) Title() string {
   365  	return fmt.Sprintf("Failure cluster [%s...] failed %d builds, %d jobs, and %d tests over %d days",
   366  		c.Identifier[0:6],
   367  		c.totalBuilds,
   368  		c.totalJobs,
   369  		c.totalTests,
   370  		c.filer.windowDays,
   371  	)
   372  }
   373  
// Body returns the body text of the github issue and *must* contain the output of ID().
// closedIssues is a (potentially empty) slice containing all closed issues authored by this bot
// that contain ID() in their body.
// If Body returns an empty string no issue is created.
func (c *Cluster) Body(closedIssues []*githubapi.Issue) string {
	// First check that the most recently closed issue (if any exist) was closed
	// before the start of the sliding window.
	cutoffTime := time.Unix(c.filer.latestStart, 0).AddDate(0, 0, -c.filer.windowDays)
	for _, closed := range closedIssues {
		// NOTE(review): ClosedAt is presumably non-nil for closed issues; a nil
		// pointer here would panic — TODO confirm against go-github's Issue type.
		if closed.ClosedAt.After(cutoffTime) {
			return ""
		}
	}

	var buf bytes.Buffer
	// Header links to the triage dashboard and embeds ID() (the cluster
	// identifier) as required by the Body contract above.
	fmt.Fprintf(&buf, "### Failure cluster [%s](%s#%s)\n", c.ID(), triageURL, c.Identifier)
	fmt.Fprintf(&buf, "##### Error text:\n```\n%s\n```\n", c.Text)
	// cluster stats
	fmt.Fprint(&buf, "##### Failure cluster statistics:\n")
	fmt.Fprintf(&buf, "%d tests failed,    %d jobs failed,    %d builds failed.\n", c.totalTests, c.totalJobs, c.totalBuilds)
	fmt.Fprintf(&buf, "Failure stats cover %d day time range '%s' to '%s'.\n##### Top failed tests by jobs failed:\n",
		c.filer.windowDays,
		cutoffTime.Format(timeFormat),
		time.Unix(c.filer.latestStart, 0).Format(timeFormat))
	// top tests failed, rendered as a markdown table
	fmt.Fprint(&buf, "\n| Test Name | Jobs Failed |\n| --- | --- |\n")
	for _, test := range c.topTestsFailed(topTestsCount) {
		fmt.Fprintf(&buf, "| %s | %d |\n", test.Name, len(test.Jobs))
	}
	// top jobs failed, each with a gubernator link to its most recent failing build
	fmt.Fprint(&buf, "\n##### Top failed jobs by builds failed:\n")
	fmt.Fprint(&buf, "\n| Job Name | Builds Failed | Latest Failure |\n| --- | --- | --- |\n")
	for _, job := range c.topJobsFailed(topJobsCount) {
		// Find the failing build with the newest start time for this job.
		latest := 0
		latestTime := int64(0)
		rowMap := c.filer.data.Builds.Jobs[job.Name]
		for _, build := range job.Builds {
			row, _ := rowMap.rowForBuild(build) // Already validated start time lookup for all builds.
			buildTime := c.filer.data.Builds.Cols.Started[row]
			if buildTime > latestTime {
				latestTime = buildTime
				latest = build
			}
		}
		path := strings.TrimPrefix(c.filer.data.Builds.JobPaths[job.Name], "gs://")
		fmt.Fprintf(&buf, "| %s | %d | [%s](https://gubernator.k8s.io/build/%s/%d) |\n", job.Name, len(job.Builds), time.Unix(latestTime, 0).Format(timeFormat), path, latest)
	}
	// previously closed issues if there are any
	if len(closedIssues) > 0 {
		fmt.Fprint(&buf, "\n##### Previously closed issues for this cluster:\n")
		for _, closed := range closedIssues {
			fmt.Fprintf(&buf, "#%d ", *closed.Number)
		}
		fmt.Fprint(&buf, "\n")
	}

	// Create /assign command.
	// Owners are mentioned via /assign in the body (not the issue's assignee
	// field) so prow performs the assignee validation; see Owners.
	testNames := make([]string, 0, len(c.Tests))
	for _, test := range c.topTestsFailed(len(c.Tests)) {
		testNames = append(testNames, test.Name)
	}
	ownersMap := c.filer.creator.TestsOwners(testNames)
	if len(ownersMap) > 0 {
		fmt.Fprint(&buf, "\n/assign")
		for user := range ownersMap {
			fmt.Fprintf(&buf, " @%s", user)
		}
		fmt.Fprint(&buf, "\n")
	}

	// Explanations of assignees and sigs
	fmt.Fprint(&buf, c.filer.creator.ExplainTestAssignments(testNames))

	fmt.Fprintf(&buf, "\n[Current Status](%s#%s)", triageURL, c.Identifier)

	return buf.String()
}
   451  
// ID yields the string identifier that uniquely identifies this issue.
// This ID must appear in the body of the issue.
// DO NOT CHANGE how this ID is formatted or duplicate issues may be created on github.
func (c *Cluster) ID() string {
	// The raw cluster identifier from the triage JSON is used verbatim.
	return c.Identifier
}
   458  
   459  // Labels returns the labels to apply to the issue created for this cluster on github.
   460  func (c *Cluster) Labels() []string {
   461  	labels := []string{"kind/flake"}
   462  
   463  	topTests := make([]string, len(c.Tests))
   464  	for i, test := range c.topTestsFailed(len(c.Tests)) {
   465  		topTests[i] = test.Name
   466  	}
   467  	for sig := range c.filer.creator.TestsSIGs(topTests) {
   468  		labels = append(labels, "sig/"+sig)
   469  	}
   470  
   471  	return labels
   472  }
   473  
// Owners returns the list of usernames to assign to this issue on github.
// Always nil for clusters:
// Assign owners by including a /assign command in the body instead of using Owners to set
// assignees on the issue request. This lets prow do the assignee validation and will mention
// the user we want to assign even if they can't be assigned.
func (c *Cluster) Owners() []string {
	return nil
}
   481  
// Priority calculates and returns the priority of this issue.
// The returned bool indicates if the returned priority is valid and can be used.
// Currently always ("", false), so no priority label is ever applied.
func (c *Cluster) Priority() (string, bool) {
	// TODO implement priority calcs later.
	return "", false
}