github.com/abayer/test-infra@v0.0.5/robots/issue-creator/sources/triage-filer.go (about)

     1  /*
     2  Copyright 2017 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package sources
    18  
    19  import (
    20  	"bytes"
    21  	"encoding/json"
    22  	"flag"
    23  	"fmt"
    24  	"reflect"
    25  	"sort"
    26  	"strconv"
    27  	"strings"
    28  	"time"
    29  
    30  	githubapi "github.com/google/go-github/github"
    31  	"k8s.io/test-infra/mungegithub/mungers/mungerutil"
    32  	"k8s.io/test-infra/robots/issue-creator/creator"
    33  )
    34  
const (
	// timeFormat is the layout used when rendering build timestamps in issue bodies.
	timeFormat = "2 Jan 2006 15:04 MST"

	// Configuration constants.
	topJobsCount   = 3                          // number of failed jobs listed per issue body
	topTestsCount  = 3                          // number of failed tests listed per issue body
	triageURL      = "https://go.k8s.io/triage" // triage dashboard linked from issue bodies
	clusterDataURL = "https://storage.googleapis.com/k8s-gubernator/triage/failure_data.json"
)
    44  
// TriageFiler files issues for clustered test failures.
type TriageFiler struct {
	topClustersCount int // number of clusters to sync issues for (flag: triage-count)
	windowDays       int // size of the sliding failure window in days (flag: triage-window)

	nextSync    time.Time // NOTE(review): not written anywhere in this file — confirm it is still used
	latestStart int64     // unix seconds of the most recent build start seen in the triage data

	creator *creator.IssueCreator // set by Issues; used for owner/SIG lookups when rendering
	data    *triageData           // parsed triage JSON, populated by loadClusters
}
    56  
// init registers the TriageFiler as an issue source named "triage-filer"
// with the IssueCreator, aborting the program if registration fails.
func init() {
	creator.RegisterSourceOrDie("triage-filer", &TriageFiler{})
}
    60  
    61  // Issues is the main work function of the TriageFiler.  It fetches and parses cluster data,
    62  // then syncs the top issues to github with the IssueCreator.
    63  func (f *TriageFiler) Issues(c *creator.IssueCreator) ([]creator.Issue, error) {
    64  	f.creator = c
    65  	rawjson, err := mungerutil.ReadHTTP(clusterDataURL)
    66  	if err != nil {
    67  		return nil, err
    68  	}
    69  	clusters, err := f.loadClusters(rawjson)
    70  	if err != nil {
    71  		return nil, err
    72  	}
    73  	topclusters := topClusters(clusters, f.topClustersCount)
    74  	issues := make([]creator.Issue, 0, len(topclusters))
    75  	for _, clust := range topclusters {
    76  		issues = append(issues, clust)
    77  	}
    78  	return issues, nil
    79  }
    80  
// RegisterFlags registers the command line options used by this source.
func (f *TriageFiler) RegisterFlags() {
	flag.IntVar(&f.topClustersCount, "triage-count", 3, "The number of clusters to sync issues for on github.")
	flag.IntVar(&f.windowDays, "triage-window", 1, "The size of the sliding time window (in days) that is used to determine which failures to consider.")
}
    86  
// triageData is a struct that represents the format of the JSON triage data and is used for parsing.
type triageData struct {
	Builds struct {
		// Cols holds parallel per-build arrays; a build's "row" index refers to
		// the same position in each of these slices.
		Cols struct {
			Elapsed     []int    `json:"elapsed"`
			Executor    []string `json:"executor"`
			PR          []string `json:"pr"`
			Result      []string `json:"result"`
			Started     []int64  `json:"started"` // build start times (unix seconds)
			TestsFailed []int    `json:"tests_failed"`
			TestsRun    []int    `json:"tests_run"`
		} `json:"cols"`
		// JobsRaw holds the raw buildnum-to-row mapping for each job; each value
		// is either a 3-element array or a dictionary (see parseTriageData).
		JobsRaw map[string]interface{} `json:"jobs"` // []int or map[string]int
		// Jobs is derived from JobsRaw by parseTriageData; not present in the JSON.
		Jobs     map[string]BuildIndexer
		JobPaths map[string]string `json:"job_paths"` // job name -> storage path prefix
	} `json:"builds"`
	Clustered []*Cluster `json:"clustered"`
}
   105  
// Cluster holds information about a failure cluster.
type Cluster struct {
	Identifier string  `json:"id"`
	Key        string  `json:"key"`
	Text       string  `json:"text"` // representative error text for the cluster
	Tests      []*Test `json:"tests"`

	// The fields below are not part of the JSON; they are populated by
	// TriageFiler.loadClusters after parsing and filtering.
	filer       *TriageFiler
	jobs        map[string][]int // job name -> deduplicated failing build numbers
	totalBuilds int
	totalJobs   int
	totalTests  int
}
   119  
// Test holds a test name and the list of jobs in which the test failed.
type Test struct {
	Name string `json:"name"`
	Jobs []*Job `json:"jobs"`
}
   125  
// Job holds a job name and the list of failing build numbers for that job.
type Job struct {
	Name   string `json:"name"`
	Builds []int  `json:"builds"`
}
   131  
// filterAndValidate removes failure data that falls outside the time window and ensures that cluster
// data is well formed. It also removes data for PR jobs so that only post-submit failures are considered.
// The window is anchored to the most recent build start time in the data (not
// time.Now()) and extends windowDays days back from there. Clusters, tests,
// and jobs that have no builds left after filtering are dropped entirely.
func (f *TriageFiler) filterAndValidate(windowDays int) error {
	// Find the most recent build start; the sliding window ends there.
	f.latestStart = int64(0)
	for _, start := range f.data.Builds.Cols.Started {
		if start > f.latestStart {
			f.latestStart = start
		}
	}
	cutoffTime := time.Unix(f.latestStart, 0).AddDate(0, 0, -windowDays).Unix()

	validClusts := []*Cluster{}
	for clustIndex, clust := range f.data.Clustered {
		if len(clust.Identifier) == 0 {
			return fmt.Errorf("the cluster at index %d in the triage JSON data does not specify an ID", clustIndex)
		}
		if clust.Tests == nil {
			return fmt.Errorf("cluster '%s' does not have a 'tests' key", clust.Identifier)
		}
		validTests := []*Test{}
		for _, test := range clust.Tests {
			if len(test.Name) == 0 {
				return fmt.Errorf("cluster '%s' contains a test without a name", clust.Identifier)
			}
			if test.Jobs == nil {
				return fmt.Errorf("cluster '%s' does not have a 'jobs' key", clust.Identifier)
			}
			validJobs := []*Job{}
			for _, job := range test.Jobs {
				if len(job.Name) == 0 {
					return fmt.Errorf("cluster '%s' contains a job without a name under test '%s'", clust.Identifier, test.Name)
				}
				// Filter out PR jobs so only post-submit failures remain.
				if strings.HasPrefix(job.Name, "pr:") {
					continue
				}
				if len(job.Builds) == 0 {
					return fmt.Errorf("cluster '%s' contains job '%s' under test '%s' with no failing builds", clust.Identifier, job.Name, test.Name)
				}
				validBuilds := []int{}
				rowMap, ok := f.data.Builds.Jobs[job.Name]
				if !ok {
					return fmt.Errorf("triage json data does not contain buildnum to row index mapping for job '%s'", job.Name)
				}
				// Keep only builds that started after the cutoff.
				for _, buildnum := range job.Builds {
					row, err := rowMap.rowForBuild(buildnum)
					if err != nil {
						return err
					}
					if f.data.Builds.Cols.Started[row] > cutoffTime {
						validBuilds = append(validBuilds, buildnum)
					}
				}
				// Only retain jobs (and, below, tests and clusters) that still
				// have in-window failures after filtering.
				if len(validBuilds) > 0 {
					job.Builds = validBuilds
					validJobs = append(validJobs, job)
				}
			}
			if len(validJobs) > 0 {
				test.Jobs = validJobs
				validTests = append(validTests, test)
			}
		}
		if len(validTests) > 0 {
			clust.Tests = validTests
			validClusts = append(validClusts, clust)
		}
	}
	f.data.Clustered = validClusts
	return nil
}
   203  
// BuildIndexer is an interface that describes the buildnum to row index mapping used to retrieve data
// about individual builds from the JSON file.
// This is an interface because the JSON format describing failure clusters has 2 ways of recording the mapping info.
type BuildIndexer interface {
	// rowForBuild returns the row index in builds.cols for the given build number.
	rowForBuild(buildnum int) (int, error)
}
   210  
   211  // ContigIndexer is a BuildIndexer implementation for when the buildnum to row index mapping describes
   212  // a contiguous set of rows via 3 ints.
   213  type ContigIndexer struct {
   214  	startRow, startBuild, count int
   215  }
   216  
   217  func (rowMap ContigIndexer) rowForBuild(buildnum int) (int, error) {
   218  	if buildnum < rowMap.startBuild || buildnum > rowMap.startBuild+rowMap.count-1 {
   219  		return 0, fmt.Errorf("failed to find row in JSON for buildnumber: %d. Row mapping or buildnumber is invalid", buildnum)
   220  	}
   221  	return buildnum - rowMap.startBuild + rowMap.startRow, nil
   222  }
   223  
   224  // DictIndexer is a BuildIndexer implementation for when the buildnum to row index mapping is simply a dictionary.
   225  // The value type of this dictionary is interface instead of int so that we don't have to convert the original map.
   226  type DictIndexer map[string]interface{}
   227  
   228  func (rowMap DictIndexer) rowForBuild(buildnum int) (int, error) {
   229  	row, ok := rowMap[strconv.Itoa(buildnum)]
   230  	if !ok {
   231  		return 0, fmt.Errorf("failed to find row in JSON for buildnumber: %d. Row mapping or buildnumber is invalid", buildnum)
   232  	}
   233  	var irow float64
   234  	if irow, ok = row.(float64); !ok {
   235  		return 0, fmt.Errorf("failed to find row in JSON for buildnumber: %d. Row mapping contains invalid type", buildnum)
   236  	}
   237  	return int(irow), nil
   238  }
   239  
   240  // loadClusters parses and filters the json data, then populates every Cluster struct with
   241  // aggregated job data and totals. The job data specifies all jobs that failed in a cluster and the
   242  // builds that failed for each job, independent of which tests the jobs or builds failed.
   243  func (f *TriageFiler) loadClusters(jsonIn []byte) ([]*Cluster, error) {
   244  	var err error
   245  	f.data, err = parseTriageData(jsonIn)
   246  	if err != nil {
   247  		return nil, err
   248  	}
   249  	if err = f.filterAndValidate(f.windowDays); err != nil {
   250  		return nil, err
   251  	}
   252  
   253  	// Aggregate failing builds in each cluster by job (independent of tests).
   254  	for _, clust := range f.data.Clustered {
   255  		clust.filer = f
   256  		clust.jobs = make(map[string][]int)
   257  
   258  		for _, test := range clust.Tests {
   259  			for _, job := range test.Jobs {
   260  				for _, buildnum := range job.Builds {
   261  					found := false
   262  					for _, oldBuild := range clust.jobs[job.Name] {
   263  						if oldBuild == buildnum {
   264  							found = true
   265  							break
   266  						}
   267  					}
   268  					if !found {
   269  						clust.jobs[job.Name] = append(clust.jobs[job.Name], buildnum)
   270  					}
   271  				}
   272  			}
   273  		}
   274  		clust.totalJobs = len(clust.jobs)
   275  		clust.totalTests = len(clust.Tests)
   276  		clust.totalBuilds = 0
   277  		for _, builds := range clust.jobs {
   278  			clust.totalBuilds += len(builds)
   279  		}
   280  	}
   281  	return f.data.Clustered, nil
   282  }
   283  
   284  // parseTriageData unmarshals raw json data into a triageData struct and creates a BuildIndexer for
   285  // every job.
   286  func parseTriageData(jsonIn []byte) (*triageData, error) {
   287  	var data triageData
   288  	if err := json.Unmarshal(jsonIn, &data); err != nil {
   289  		return nil, err
   290  	}
   291  
   292  	if data.Builds.Cols.Started == nil {
   293  		return nil, fmt.Errorf("triage data json is missing the builds.cols.started key")
   294  	}
   295  	if data.Builds.JobsRaw == nil {
   296  		return nil, fmt.Errorf("triage data is missing the builds.jobs key")
   297  	}
   298  	if data.Builds.JobPaths == nil {
   299  		return nil, fmt.Errorf("triage data is missing the builds.job_paths key")
   300  	}
   301  	if data.Clustered == nil {
   302  		return nil, fmt.Errorf("triage data is missing the clustered key")
   303  	}
   304  	// Populate 'Jobs' with the BuildIndexer for each job.
   305  	data.Builds.Jobs = make(map[string]BuildIndexer)
   306  	for jobID, mapper := range data.Builds.JobsRaw {
   307  		switch mapper := mapper.(type) {
   308  		case []interface{}:
   309  			// In this case mapper is a 3 member array. 0:first buildnum, 1:number of builds, 2:start index.
   310  			data.Builds.Jobs[jobID] = ContigIndexer{
   311  				startBuild: int(mapper[0].(float64)),
   312  				count:      int(mapper[1].(float64)),
   313  				startRow:   int(mapper[2].(float64)),
   314  			}
   315  		case map[string]interface{}:
   316  			// In this case mapper is a dictionary.
   317  			data.Builds.Jobs[jobID] = DictIndexer(mapper)
   318  		default:
   319  			return nil, fmt.Errorf("the build number to row index mapping for job '%s' is not an accepted type. Type is: %v", jobID, reflect.TypeOf(mapper))
   320  		}
   321  	}
   322  	return &data, nil
   323  }
   324  
   325  // topClusters gets the 'count' most important clusters from a slice of clusters based on number of build failures.
   326  func topClusters(clusters []*Cluster, count int) []*Cluster {
   327  	less := func(i, j int) bool { return clusters[i].totalBuilds > clusters[j].totalBuilds }
   328  	sort.SliceStable(clusters, less)
   329  
   330  	if len(clusters) < count {
   331  		count = len(clusters)
   332  	}
   333  	return clusters[0:count]
   334  }
   335  
   336  // topTestsFailing returns the top 'count' test names sorted by number of failing jobs.
   337  func (c *Cluster) topTestsFailed(count int) []*Test {
   338  	less := func(i, j int) bool { return len(c.Tests[i].Jobs) > len(c.Tests[j].Jobs) }
   339  	sort.SliceStable(c.Tests, less)
   340  
   341  	if len(c.Tests) < count {
   342  		count = len(c.Tests)
   343  	}
   344  	return c.Tests[0:count]
   345  }
   346  
   347  // topJobsFailed returns the top 'count' job names sorted by number of failing builds.
   348  func (c *Cluster) topJobsFailed(count int) []*Job {
   349  	slice := make([]*Job, len(c.jobs))
   350  	i := 0
   351  	for jobName, builds := range c.jobs {
   352  		slice[i] = &Job{Name: jobName, Builds: builds}
   353  		i++
   354  	}
   355  	less := func(i, j int) bool { return len(slice[i].Builds) > len(slice[j].Builds) }
   356  	sort.SliceStable(slice, less)
   357  
   358  	if len(slice) < count {
   359  		count = len(slice)
   360  	}
   361  	return slice[0:count]
   362  }
   363  
   364  // Title is the string to use as the github issue title.
   365  func (c *Cluster) Title() string {
   366  	return fmt.Sprintf("Failure cluster [%s...] failed %d builds, %d jobs, and %d tests over %d days",
   367  		c.Identifier[0:6],
   368  		c.totalBuilds,
   369  		c.totalJobs,
   370  		c.totalTests,
   371  		c.filer.windowDays,
   372  	)
   373  }
   374  
// Body returns the body text of the github issue and *must* contain the output of ID().
// closedIssues is a (potentially empty) slice containing all closed issues authored by this bot
// that contain ID() in their body.
// If Body returns an empty string no issue is created.
func (c *Cluster) Body(closedIssues []*githubapi.Issue) string {
	// First check that the most recently closed issue (if any exist) was closed
	// before the start of the sliding window. If any closed issue overlaps the
	// window, return "" so the issue is not recreated.
	cutoffTime := time.Unix(c.filer.latestStart, 0).AddDate(0, 0, -c.filer.windowDays)
	for _, closed := range closedIssues {
		if closed.ClosedAt.After(cutoffTime) {
			return ""
		}
	}

	var buf bytes.Buffer
	// Header links to the triage dashboard entry and embeds ID() as required.
	fmt.Fprintf(&buf, "### Failure cluster [%s](%s#%s)\n", c.ID(), triageURL, c.Identifier)
	fmt.Fprintf(&buf, "##### Error text:\n```\n%s\n```\n", c.Text)
	// cluster stats
	fmt.Fprint(&buf, "##### Failure cluster statistics:\n")
	fmt.Fprintf(&buf, "%d tests failed,    %d jobs failed,    %d builds failed.\n", c.totalTests, c.totalJobs, c.totalBuilds)
	fmt.Fprintf(&buf, "Failure stats cover %d day time range '%s' to '%s'.\n##### Top failed tests by jobs failed:\n",
		c.filer.windowDays,
		cutoffTime.Format(timeFormat),
		time.Unix(c.filer.latestStart, 0).Format(timeFormat))
	// top tests failed, rendered as a markdown table
	fmt.Fprint(&buf, "\n| Test Name | Jobs Failed |\n| --- | --- |\n")
	for _, test := range c.topTestsFailed(topTestsCount) {
		fmt.Fprintf(&buf, "| %s | %d |\n", test.Name, len(test.Jobs))
	}
	// top jobs failed, each with a link to its latest failing build
	fmt.Fprint(&buf, "\n##### Top failed jobs by builds failed:\n")
	fmt.Fprint(&buf, "\n| Job Name | Builds Failed | Latest Failure |\n| --- | --- | --- |\n")
	for _, job := range c.topJobsFailed(topJobsCount) {
		// Find the build with the most recent start time for this job.
		latest := 0
		latestTime := int64(0)
		rowMap := c.filer.data.Builds.Jobs[job.Name]
		for _, build := range job.Builds {
			row, _ := rowMap.rowForBuild(build) // Already validated start time lookup for all builds.
			buildTime := c.filer.data.Builds.Cols.Started[row]
			if buildTime > latestTime {
				latestTime = buildTime
				latest = build
			}
		}
		path := strings.TrimPrefix(c.filer.data.Builds.JobPaths[job.Name], "gs://")
		fmt.Fprintf(&buf, "| %s | %d | [%s](https://k8s-gubernator.appspot.com/build/%s/%d) |\n", job.Name, len(job.Builds), time.Unix(latestTime, 0).Format(timeFormat), path, latest)
	}
	// previously closed issues if there are any
	if len(closedIssues) > 0 {
		fmt.Fprint(&buf, "\n##### Previously closed issues for this cluster:\n")
		for _, closed := range closedIssues {
			fmt.Fprintf(&buf, "#%d ", *closed.Number)
		}
		fmt.Fprint(&buf, "\n")
	}

	// Create /assign command listing the owners of every failed test, so prow
	// handles assignee validation instead of the issue request itself.
	testNames := make([]string, 0, len(c.Tests))
	for _, test := range c.topTestsFailed(len(c.Tests)) {
		testNames = append(testNames, test.Name)
	}
	ownersMap := c.filer.creator.TestsOwners(testNames)
	if len(ownersMap) > 0 {
		fmt.Fprint(&buf, "\n/assign")
		for user := range ownersMap {
			fmt.Fprintf(&buf, " @%s", user)
		}
		fmt.Fprint(&buf, "\n")
	}

	// Explanations of assignees and sigs
	fmt.Fprint(&buf, c.filer.creator.ExplainTestAssignments(testNames))

	fmt.Fprintf(&buf, "\n[Current Status](%s#%s)", triageURL, c.Identifier)

	return buf.String()
}
   452  
// ID yields the string identifier that uniquely identifies this issue.
// This ID must appear in the body of the issue.
// DO NOT CHANGE how this ID is formatted or duplicate issues may be created on github.
func (c *Cluster) ID() string {
	return c.Identifier
}
   459  
   460  // Labels returns the labels to apply to the issue created for this cluster on github.
   461  func (c *Cluster) Labels() []string {
   462  	labels := []string{"kind/flake"}
   463  
   464  	topTests := make([]string, len(c.Tests))
   465  	for i, test := range c.topTestsFailed(len(c.Tests)) {
   466  		topTests[i] = test.Name
   467  	}
   468  	for sig := range c.filer.creator.TestsSIGs(topTests) {
   469  		labels = append(labels, "sig/"+sig)
   470  	}
   471  
   472  	return labels
   473  }
   474  
// Owners returns the list of usernames to assign to this issue on github.
// It always returns nil: see the comment below for why assignment happens
// through the issue body instead.
func (c *Cluster) Owners() []string {
	// Assign owners by including a /assign command in the body instead of using Owners to set
	// assignees on the issue request. This lets prow do the assignee validation and will mention
	// the user we want to assign even if they can't be assigned.
	return nil
}
   482  
// Priority calculates and returns the priority of this issue.
// The returned bool indicates if the returned priority is valid and can be used.
// Currently a stub: always reports no valid priority.
func (c *Cluster) Priority() (string, bool) {
	// TODO implement priority calcs later.
	return "", false
}