k8s.io/test-infra@v0.0.0-20240520184403-27c6b4c223d8/robots/issue-creator/sources/triage-filer.go (about)

     1  /*
     2  Copyright 2017 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package sources
    18  
    19  import (
    20  	"bytes"
    21  	"encoding/json"
    22  	"flag"
    23  	"fmt"
    24  	"reflect"
    25  	"sort"
    26  	"strconv"
    27  	"strings"
    28  	"time"
    29  
    30  	githubapi "github.com/google/go-github/github"
    31  	"k8s.io/test-infra/robots/issue-creator/creator"
    32  )
    33  
const (
	// timeFormat is the reference layout used to render build timestamps in issue bodies.
	timeFormat = "2 Jan 2006 15:04 MST"

	// Configuration constants.
	topJobsCount   = 3                                                                        // max rows in the "Top failed jobs" table.
	topTestsCount  = 3                                                                        // max rows in the "Top failed tests" table.
	triageURL      = "https://go.k8s.io/triage"                                               // human-facing triage dashboard linked from issues.
	clusterDataURL = "https://storage.googleapis.com/k8s-gubernator/triage/failure_data.json" // source of the clustered failure JSON.
)
    43  
// TriageFiler files issues for clustered test failures.
type TriageFiler struct {
	topClustersCount int // number of clusters to sync issues for (set by the triage-count flag).
	windowDays       int // sliding time window size in days (set by the triage-window flag).

	// latestStart is the most recent build start time (unix seconds) seen in the data;
	// the filtering window extends backwards from it.
	latestStart int64

	creator *creator.IssueCreator // used for owner/SIG lookups when rendering issues.
	data    *triageData           // parsed and filtered triage JSON.
}
    54  
// init registers the TriageFiler as an issue source with the IssueCreator
// under the name "triage-filer".
func init() {
	creator.RegisterSourceOrDie("triage-filer", &TriageFiler{})
}
    58  
    59  // Issues is the main work function of the TriageFiler.  It fetches and parses cluster data,
    60  // then syncs the top issues to github with the IssueCreator.
    61  func (f *TriageFiler) Issues(c *creator.IssueCreator) ([]creator.Issue, error) {
    62  	f.creator = c
    63  	rawjson, err := ReadHTTP(clusterDataURL)
    64  	if err != nil {
    65  		return nil, err
    66  	}
    67  	clusters, err := f.loadClusters(rawjson)
    68  	if err != nil {
    69  		return nil, err
    70  	}
    71  	topclusters := topClusters(clusters, f.topClustersCount)
    72  	issues := make([]creator.Issue, 0, len(topclusters))
    73  	for _, clust := range topclusters {
    74  		issues = append(issues, clust)
    75  	}
    76  	return issues, nil
    77  }
    78  
// RegisterFlags registers the command-line options for this munger.
func (f *TriageFiler) RegisterFlags() {
	flag.IntVar(&f.topClustersCount, "triage-count", 3, "The number of clusters to sync issues for on github.")
	flag.IntVar(&f.windowDays, "triage-window", 1, "The size of the sliding time window (in days) that is used to determine which failures to consider.")
}
    84  
// triageData is a struct that represents the format of the JSON triage data and is used for parsing.
type triageData struct {
	Builds struct {
		// Cols holds per-build columns; rows are addressed via the Jobs mapping.
		Cols struct {
			Elapsed     []int    `json:"elapsed"`
			Executor    []string `json:"executor"`
			PR          []string `json:"pr"`
			Result      []string `json:"result"`
			Started     []int64  `json:"started"` // build start times (unix seconds).
			TestsFailed []int    `json:"tests_failed"`
			TestsRun    []int    `json:"tests_run"`
		} `json:"cols"`
		JobsRaw map[string]interface{} `json:"jobs"` // []int or map[string]int
		// Jobs is derived from JobsRaw by parseTriageData; it maps each job name to a
		// BuildIndexer that translates build numbers into Cols row indexes.
		Jobs     map[string]BuildIndexer
		JobPaths map[string]string `json:"job_paths"` // job name -> storage path (gs:// prefix).
	} `json:"builds"`
	Clustered []*Cluster `json:"clustered"`
}
   103  
// Cluster holds information about a failure cluster.
type Cluster struct {
	Identifier string  `json:"id"`
	Key        string  `json:"key"`
	Text       string  `json:"text"` // error text shared by the failures in this cluster.
	Tests      []*Test `json:"tests"`

	// The fields below are not part of the JSON; they are populated by
	// TriageFiler.loadClusters after parsing and filtering.
	filer       *TriageFiler
	jobs        map[string][]int // job name -> deduplicated failing build numbers.
	totalBuilds int
	totalJobs   int
	totalTests  int
}
   117  
// Test holds a failing test's name and the list of jobs in which it failed.
type Test struct {
	Name string `json:"name"`
	Jobs []*Job `json:"jobs"`
}
   123  
// Job holds a job's name and the list of build numbers that failed.
type Job struct {
	Name   string `json:"name"`
	Builds []int  `json:"builds"`
}
   129  
   130  // filterAndValidate removes failure data that falls outside the time window and ensures that cluster
   131  // data is well formed. It also removes data for PR jobs so that only post-submit failures are considered.
   132  func (f *TriageFiler) filterAndValidate(windowDays int) error {
   133  	f.latestStart = int64(0)
   134  	for _, start := range f.data.Builds.Cols.Started {
   135  		if start > f.latestStart {
   136  			f.latestStart = start
   137  		}
   138  	}
   139  	cutoffTime := time.Unix(f.latestStart, 0).AddDate(0, 0, -windowDays).Unix()
   140  
   141  	validClusts := []*Cluster{}
   142  	for clustIndex, clust := range f.data.Clustered {
   143  		if len(clust.Identifier) == 0 {
   144  			return fmt.Errorf("the cluster at index %d in the triage JSON data does not specify an ID", clustIndex)
   145  		}
   146  		if clust.Tests == nil {
   147  			return fmt.Errorf("cluster '%s' does not have a 'tests' key", clust.Identifier)
   148  		}
   149  		validTests := []*Test{}
   150  		for _, test := range clust.Tests {
   151  			if len(test.Name) == 0 {
   152  				return fmt.Errorf("cluster '%s' contains a test without a name", clust.Identifier)
   153  			}
   154  			if test.Jobs == nil {
   155  				return fmt.Errorf("cluster '%s' does not have a 'jobs' key", clust.Identifier)
   156  			}
   157  			validJobs := []*Job{}
   158  			for _, job := range test.Jobs {
   159  				if len(job.Name) == 0 {
   160  					return fmt.Errorf("cluster '%s' contains a job without a name under test '%s'", clust.Identifier, test.Name)
   161  				}
   162  				// Filter out PR jobs
   163  				if strings.HasPrefix(job.Name, "pr:") {
   164  					continue
   165  				}
   166  				if len(job.Builds) == 0 {
   167  					return fmt.Errorf("cluster '%s' contains job '%s' under test '%s' with no failing builds", clust.Identifier, job.Name, test.Name)
   168  				}
   169  				validBuilds := []int{}
   170  				rowMap, ok := f.data.Builds.Jobs[job.Name]
   171  				if !ok {
   172  					return fmt.Errorf("triage json data does not contain buildnum to row index mapping for job '%s'", job.Name)
   173  				}
   174  				for _, buildnum := range job.Builds {
   175  					row, err := rowMap.rowForBuild(buildnum)
   176  					if err != nil {
   177  						return err
   178  					}
   179  					if f.data.Builds.Cols.Started[row] > cutoffTime {
   180  						validBuilds = append(validBuilds, buildnum)
   181  					}
   182  				}
   183  				if len(validBuilds) > 0 {
   184  					job.Builds = validBuilds
   185  					validJobs = append(validJobs, job)
   186  				}
   187  			}
   188  			if len(validJobs) > 0 {
   189  				test.Jobs = validJobs
   190  				validTests = append(validTests, test)
   191  			}
   192  		}
   193  		if len(validTests) > 0 {
   194  			clust.Tests = validTests
   195  			validClusts = append(validClusts, clust)
   196  		}
   197  	}
   198  	f.data.Clustered = validClusts
   199  	return nil
   200  }
   201  
// BuildIndexer is an interface that describes the buildnum to row index mapping used to retrieve data
// about individual builds from the JSON file.
// This is an interface because the JSON format describing failure clusters has 2 ways of recording the mapping info.
type BuildIndexer interface {
	// rowForBuild returns the Cols row index for the given build number, or an
	// error if the build number is not covered by this mapping.
	rowForBuild(buildnum int) (int, error)
}
   208  
   209  // ContigIndexer is a BuildIndexer implementation for when the buildnum to row index mapping describes
   210  // a contiguous set of rows via 3 ints.
   211  type ContigIndexer struct {
   212  	startRow, startBuild, count int
   213  }
   214  
   215  func (rowMap ContigIndexer) rowForBuild(buildnum int) (int, error) {
   216  	if buildnum < rowMap.startBuild || buildnum > rowMap.startBuild+rowMap.count-1 {
   217  		return 0, fmt.Errorf("failed to find row in JSON for buildnumber: %d. Row mapping or buildnumber is invalid", buildnum)
   218  	}
   219  	return buildnum - rowMap.startBuild + rowMap.startRow, nil
   220  }
   221  
   222  // DictIndexer is a BuildIndexer implementation for when the buildnum to row index mapping is simply a dictionary.
   223  // The value type of this dictionary is interface instead of int so that we don't have to convert the original map.
   224  type DictIndexer map[string]interface{}
   225  
   226  func (rowMap DictIndexer) rowForBuild(buildnum int) (int, error) {
   227  	row, ok := rowMap[strconv.Itoa(buildnum)]
   228  	if !ok {
   229  		return 0, fmt.Errorf("failed to find row in JSON for buildnumber: %d. Row mapping or buildnumber is invalid", buildnum)
   230  	}
   231  	var irow float64
   232  	if irow, ok = row.(float64); !ok {
   233  		return 0, fmt.Errorf("failed to find row in JSON for buildnumber: %d. Row mapping contains invalid type", buildnum)
   234  	}
   235  	return int(irow), nil
   236  }
   237  
   238  // loadClusters parses and filters the json data, then populates every Cluster struct with
   239  // aggregated job data and totals. The job data specifies all jobs that failed in a cluster and the
   240  // builds that failed for each job, independent of which tests the jobs or builds failed.
   241  func (f *TriageFiler) loadClusters(jsonIn []byte) ([]*Cluster, error) {
   242  	var err error
   243  	f.data, err = parseTriageData(jsonIn)
   244  	if err != nil {
   245  		return nil, err
   246  	}
   247  	if err = f.filterAndValidate(f.windowDays); err != nil {
   248  		return nil, err
   249  	}
   250  
   251  	// Aggregate failing builds in each cluster by job (independent of tests).
   252  	for _, clust := range f.data.Clustered {
   253  		clust.filer = f
   254  		clust.jobs = make(map[string][]int)
   255  
   256  		for _, test := range clust.Tests {
   257  			for _, job := range test.Jobs {
   258  				for _, buildnum := range job.Builds {
   259  					found := false
   260  					for _, oldBuild := range clust.jobs[job.Name] {
   261  						if oldBuild == buildnum {
   262  							found = true
   263  							break
   264  						}
   265  					}
   266  					if !found {
   267  						clust.jobs[job.Name] = append(clust.jobs[job.Name], buildnum)
   268  					}
   269  				}
   270  			}
   271  		}
   272  		clust.totalJobs = len(clust.jobs)
   273  		clust.totalTests = len(clust.Tests)
   274  		clust.totalBuilds = 0
   275  		for _, builds := range clust.jobs {
   276  			clust.totalBuilds += len(builds)
   277  		}
   278  	}
   279  	return f.data.Clustered, nil
   280  }
   281  
   282  // parseTriageData unmarshals raw json data into a triageData struct and creates a BuildIndexer for
   283  // every job.
   284  func parseTriageData(jsonIn []byte) (*triageData, error) {
   285  	var data triageData
   286  	if err := json.Unmarshal(jsonIn, &data); err != nil {
   287  		return nil, err
   288  	}
   289  
   290  	if data.Builds.Cols.Started == nil {
   291  		return nil, fmt.Errorf("triage data json is missing the builds.cols.started key")
   292  	}
   293  	if data.Builds.JobsRaw == nil {
   294  		return nil, fmt.Errorf("triage data is missing the builds.jobs key")
   295  	}
   296  	if data.Builds.JobPaths == nil {
   297  		return nil, fmt.Errorf("triage data is missing the builds.job_paths key")
   298  	}
   299  	if data.Clustered == nil {
   300  		return nil, fmt.Errorf("triage data is missing the clustered key")
   301  	}
   302  	// Populate 'Jobs' with the BuildIndexer for each job.
   303  	data.Builds.Jobs = make(map[string]BuildIndexer)
   304  	for jobID, mapper := range data.Builds.JobsRaw {
   305  		switch mapper := mapper.(type) {
   306  		case []interface{}:
   307  			// In this case mapper is a 3 member array. 0:first buildnum, 1:number of builds, 2:start index.
   308  			data.Builds.Jobs[jobID] = ContigIndexer{
   309  				startBuild: int(mapper[0].(float64)),
   310  				count:      int(mapper[1].(float64)),
   311  				startRow:   int(mapper[2].(float64)),
   312  			}
   313  		case map[string]interface{}:
   314  			// In this case mapper is a dictionary.
   315  			data.Builds.Jobs[jobID] = DictIndexer(mapper)
   316  		default:
   317  			return nil, fmt.Errorf("the build number to row index mapping for job '%s' is not an accepted type. Type is: %v", jobID, reflect.TypeOf(mapper))
   318  		}
   319  	}
   320  	return &data, nil
   321  }
   322  
   323  // topClusters gets the 'count' most important clusters from a slice of clusters based on number of build failures.
   324  func topClusters(clusters []*Cluster, count int) []*Cluster {
   325  	less := func(i, j int) bool { return clusters[i].totalBuilds > clusters[j].totalBuilds }
   326  	sort.SliceStable(clusters, less)
   327  
   328  	if len(clusters) < count {
   329  		count = len(clusters)
   330  	}
   331  	return clusters[0:count]
   332  }
   333  
   334  // topTestsFailing returns the top 'count' test names sorted by number of failing jobs.
   335  func (c *Cluster) topTestsFailed(count int) []*Test {
   336  	less := func(i, j int) bool { return len(c.Tests[i].Jobs) > len(c.Tests[j].Jobs) }
   337  	sort.SliceStable(c.Tests, less)
   338  
   339  	if len(c.Tests) < count {
   340  		count = len(c.Tests)
   341  	}
   342  	return c.Tests[0:count]
   343  }
   344  
   345  // topJobsFailed returns the top 'count' job names sorted by number of failing builds.
   346  func (c *Cluster) topJobsFailed(count int) []*Job {
   347  	slice := make([]*Job, len(c.jobs))
   348  	i := 0
   349  	for jobName, builds := range c.jobs {
   350  		slice[i] = &Job{Name: jobName, Builds: builds}
   351  		i++
   352  	}
   353  	less := func(i, j int) bool { return len(slice[i].Builds) > len(slice[j].Builds) }
   354  	sort.SliceStable(slice, less)
   355  
   356  	if len(slice) < count {
   357  		count = len(slice)
   358  	}
   359  	return slice[0:count]
   360  }
   361  
   362  // Title is the string to use as the github issue title.
   363  func (c *Cluster) Title() string {
   364  	return fmt.Sprintf("Failure cluster [%s...] failed %d builds, %d jobs, and %d tests over %d days",
   365  		c.Identifier[0:6],
   366  		c.totalBuilds,
   367  		c.totalJobs,
   368  		c.totalTests,
   369  		c.filer.windowDays,
   370  	)
   371  }
   372  
// Body returns the body text of the github issue and *must* contain the output of ID().
// closedIssues is a (potentially empty) slice containing all closed issues authored by this bot
// that contain ID() in their body.
// If Body returns an empty string no issue is created.
func (c *Cluster) Body(closedIssues []*githubapi.Issue) string {
	// First check that the most recently closed issue (if any exist) was closed
	// before the start of the sliding window. If any closed issue falls inside
	// the window, return "" so no duplicate issue is created.
	cutoffTime := time.Unix(c.filer.latestStart, 0).AddDate(0, 0, -c.filer.windowDays)
	for _, closed := range closedIssues {
		if closed.ClosedAt.After(cutoffTime) {
			return ""
		}
	}

	var buf bytes.Buffer
	// Header links to the triage dashboard and embeds ID() as required.
	fmt.Fprintf(&buf, "### Failure cluster [%s](%s#%s)\n", c.ID(), triageURL, c.Identifier)
	fmt.Fprintf(&buf, "##### Error text:\n```\n%s\n```\n", c.Text)
	// cluster stats
	fmt.Fprint(&buf, "##### Failure cluster statistics:\n")
	fmt.Fprintf(&buf, "%d tests failed,    %d jobs failed,    %d builds failed.\n", c.totalTests, c.totalJobs, c.totalBuilds)
	fmt.Fprintf(&buf, "Failure stats cover %d day time range '%s' to '%s'.\n##### Top failed tests by jobs failed:\n",
		c.filer.windowDays,
		cutoffTime.Format(timeFormat),
		time.Unix(c.filer.latestStart, 0).Format(timeFormat))
	// top tests failed, rendered as a markdown table
	fmt.Fprint(&buf, "\n| Test Name | Jobs Failed |\n| --- | --- |\n")
	for _, test := range c.topTestsFailed(topTestsCount) {
		fmt.Fprintf(&buf, "| %s | %d |\n", test.Name, len(test.Jobs))
	}
	// top jobs failed, each linking to its most recent failing build
	fmt.Fprint(&buf, "\n##### Top failed jobs by builds failed:\n")
	fmt.Fprint(&buf, "\n| Job Name | Builds Failed | Latest Failure |\n| --- | --- | --- |\n")
	for _, job := range c.topJobsFailed(topJobsCount) {
		// Scan the job's builds to find the one with the latest start time.
		latest := 0
		latestTime := int64(0)
		rowMap := c.filer.data.Builds.Jobs[job.Name]
		for _, build := range job.Builds {
			row, _ := rowMap.rowForBuild(build) // Already validated start time lookup for all builds.
			buildTime := c.filer.data.Builds.Cols.Started[row]
			if buildTime > latestTime {
				latestTime = buildTime
				latest = build
			}
		}
		path := strings.TrimPrefix(c.filer.data.Builds.JobPaths[job.Name], "gs://")
		fmt.Fprintf(&buf, "| %s | %d | [%s](https://prow.k8s.io/view/gs/%s/%d) |\n", job.Name, len(job.Builds), time.Unix(latestTime, 0).Format(timeFormat), path, latest)
	}
	// previously closed issues if there are any
	if len(closedIssues) > 0 {
		fmt.Fprint(&buf, "\n##### Previously closed issues for this cluster:\n")
		for _, closed := range closedIssues {
			fmt.Fprintf(&buf, "#%d ", *closed.Number)
		}
		fmt.Fprint(&buf, "\n")
	}

	// Create /assign command. Assignment happens via this command rather than the
	// issue request itself (see Owners below).
	testNames := make([]string, 0, len(c.Tests))
	for _, test := range c.topTestsFailed(len(c.Tests)) {
		testNames = append(testNames, test.Name)
	}
	ownersMap := c.filer.creator.TestsOwners(testNames)
	if len(ownersMap) > 0 {
		fmt.Fprint(&buf, "\n/assign")
		for user := range ownersMap {
			fmt.Fprintf(&buf, " @%s", user)
		}
		fmt.Fprint(&buf, "\n")
	}

	// Explanations of assignees and sigs
	fmt.Fprint(&buf, c.filer.creator.ExplainTestAssignments(testNames))

	fmt.Fprintf(&buf, "\n[Current Status](%s#%s)", triageURL, c.Identifier)

	return buf.String()
}
   450  
// ID yields the string identifier that uniquely identifies this issue.
// This ID must appear in the body of the issue.
// DO NOT CHANGE how this ID is formatted or duplicate issues may be created on github.
func (c *Cluster) ID() string {
	// The raw cluster identifier from the triage JSON is used verbatim.
	return c.Identifier
}
   457  
   458  // Labels returns the labels to apply to the issue created for this cluster on github.
   459  func (c *Cluster) Labels() []string {
   460  	labels := []string{"kind/flake"}
   461  
   462  	topTests := make([]string, len(c.Tests))
   463  	for i, test := range c.topTestsFailed(len(c.Tests)) {
   464  		topTests[i] = test.Name
   465  	}
   466  	for sig := range c.filer.creator.TestsSIGs(topTests) {
   467  		labels = append(labels, "sig/"+sig)
   468  	}
   469  
   470  	return labels
   471  }
   472  
// Owners returns the list of usernames to assign to this issue on github.
// It always returns nil for this source.
func (c *Cluster) Owners() []string {
	// Assign owners by including a /assign command in the body instead of using Owners to set
	// assignees on the issue request. This lets prow do the assignee validation and will mention
	// the user we want to assign even if they can't be assigned.
	return nil
}
   480  
// Priority calculates and returns the priority of this issue.
// The returned bool indicates if the returned priority is valid and can be used.
func (c *Cluster) Priority() (string, bool) {
	// TODO implement priority calcs later.
	// No priority is assigned yet, so always report the result as invalid.
	return "", false
}