github.com/munnerz/test-infra@v0.0.0-20190108210205-ce3d181dc989/robots/issue-creator/sources/triage-filer.go (about) 1 /* 2 Copyright 2017 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package sources 18 19 import ( 20 "bytes" 21 "encoding/json" 22 "flag" 23 "fmt" 24 "reflect" 25 "sort" 26 "strconv" 27 "strings" 28 "time" 29 30 githubapi "github.com/google/go-github/github" 31 "k8s.io/test-infra/robots/issue-creator/creator" 32 ) 33 34 const ( 35 timeFormat = "2 Jan 2006 15:04 MST" 36 37 // Configuration constants. 38 topJobsCount = 3 39 topTestsCount = 3 40 triageURL = "https://go.k8s.io/triage" 41 clusterDataURL = "https://storage.googleapis.com/k8s-gubernator/triage/failure_data.json" 42 ) 43 44 // TriageFiler files issues for clustered test failures. 45 type TriageFiler struct { 46 topClustersCount int 47 windowDays int 48 49 nextSync time.Time 50 latestStart int64 51 52 creator *creator.IssueCreator 53 data *triageData 54 } 55 56 func init() { 57 creator.RegisterSourceOrDie("triage-filer", &TriageFiler{}) 58 } 59 60 // Issues is the main work function of the TriageFiler. It fetches and parses cluster data, 61 // then syncs the top issues to github with the IssueCreator. 62 func (f *TriageFiler) Issues(c *creator.IssueCreator) ([]creator.Issue, error) { 63 f.creator = c 64 rawjson, err := ReadHTTP(clusterDataURL) 65 if err != nil { 66 return nil, err 67 } 68 clusters, err := f.loadClusters(rawjson) 69 if err != nil { 70 return nil, err 71 } 72 topclusters := topClusters(clusters, f.topClustersCount) 73 issues := make([]creator.Issue, 0, len(topclusters)) 74 for _, clust := range topclusters { 75 issues = append(issues, clust) 76 } 77 return issues, nil 78 } 79 80 // RegisterFlags registers options for this munger; returns any that require a restart when changed. 81 func (f *TriageFiler) RegisterFlags() { 82 flag.IntVar(&f.topClustersCount, "triage-count", 3, "The number of clusters to sync issues for on github.") 83 flag.IntVar(&f.windowDays, "triage-window", 1, "The size of the sliding time window (in days) that is used to determine which failures to consider.") 84 } 85 86 // triageData is a struct that represents the format of the JSON triage data and is used for parsing. 87 type triageData struct { 88 Builds struct { 89 Cols struct { 90 Elapsed []int `json:"elapsed"` 91 Executor []string `json:"executor"` 92 PR []string `json:"pr"` 93 Result []string `json:"result"` 94 Started []int64 `json:"started"` 95 TestsFailed []int `json:"tests_failed"` 96 TestsRun []int `json:"tests_run"` 97 } `json:"cols"` 98 JobsRaw map[string]interface{} `json:"jobs"` // []int or map[string]int 99 Jobs map[string]BuildIndexer 100 JobPaths map[string]string `json:"job_paths"` 101 } `json:"builds"` 102 Clustered []*Cluster `json:"clustered"` 103 } 104 105 // Cluster holds information about a failure cluster. 106 type Cluster struct { 107 Identifier string `json:"id"` 108 Key string `json:"key"` 109 Text string `json:"text"` 110 Tests []*Test `json:"tests"` 111 112 filer *TriageFiler 113 jobs map[string][]int 114 totalBuilds int 115 totalJobs int 116 totalTests int 117 } 118 119 // Test holds a name and list of jobs 120 type Test struct { 121 Name string `json:"name"` 122 Jobs []*Job `json:"jobs"` 123 } 124 125 // Job holds a name and list of build numbers 126 type Job struct { 127 Name string `json:"name"` 128 Builds []int `json:"builds"` 129 } 130 131 // filterAndValidate removes failure data that falls outside the time window and ensures that cluster 132 // data is well formed. It also removes data for PR jobs so that only post-submit failures are considered. 133 func (f *TriageFiler) filterAndValidate(windowDays int) error { 134 f.latestStart = int64(0) 135 for _, start := range f.data.Builds.Cols.Started { 136 if start > f.latestStart { 137 f.latestStart = start 138 } 139 } 140 cutoffTime := time.Unix(f.latestStart, 0).AddDate(0, 0, -windowDays).Unix() 141 142 validClusts := []*Cluster{} 143 for clustIndex, clust := range f.data.Clustered { 144 if len(clust.Identifier) == 0 { 145 return fmt.Errorf("the cluster at index %d in the triage JSON data does not specify an ID", clustIndex) 146 } 147 if clust.Tests == nil { 148 return fmt.Errorf("cluster '%s' does not have a 'tests' key", clust.Identifier) 149 } 150 validTests := []*Test{} 151 for _, test := range clust.Tests { 152 if len(test.Name) == 0 { 153 return fmt.Errorf("cluster '%s' contains a test without a name", clust.Identifier) 154 } 155 if test.Jobs == nil { 156 return fmt.Errorf("cluster '%s' does not have a 'jobs' key", clust.Identifier) 157 } 158 validJobs := []*Job{} 159 for _, job := range test.Jobs { 160 if len(job.Name) == 0 { 161 return fmt.Errorf("cluster '%s' contains a job without a name under test '%s'", clust.Identifier, test.Name) 162 } 163 // Filter out PR jobs 164 if strings.HasPrefix(job.Name, "pr:") { 165 continue 166 } 167 if len(job.Builds) == 0 { 168 return fmt.Errorf("cluster '%s' contains job '%s' under test '%s' with no failing builds", clust.Identifier, job.Name, test.Name) 169 } 170 validBuilds := []int{} 171 rowMap, ok := f.data.Builds.Jobs[job.Name] 172 if !ok { 173 return fmt.Errorf("triage json data does not contain buildnum to row index mapping for job '%s'", job.Name) 174 } 175 for _, buildnum := range job.Builds { 176 row, err := rowMap.rowForBuild(buildnum) 177 if err != nil { 178 return err 179 } 180 if f.data.Builds.Cols.Started[row] > cutoffTime { 181 validBuilds = append(validBuilds, buildnum) 182 } 183 } 184 if len(validBuilds) > 0 { 185 job.Builds = validBuilds 186 validJobs = append(validJobs, job) 187 } 188 } 189 if len(validJobs) > 0 { 190 test.Jobs = validJobs 191 validTests = append(validTests, test) 192 } 193 } 194 if len(validTests) > 0 { 195 clust.Tests = validTests 196 validClusts = append(validClusts, clust) 197 } 198 } 199 f.data.Clustered = validClusts 200 return nil 201 } 202 203 // BuildIndexer is an interface that describes the buildnum to row index mapping used to retrieve data 204 // about individual builds from the JSON file. 205 // This is an interface because the JSON format describing failure clusters has 2 ways of recording the mapping info. 206 type BuildIndexer interface { 207 rowForBuild(buildnum int) (int, error) 208 } 209 210 // ContigIndexer is a BuildIndexer implementation for when the buildnum to row index mapping describes 211 // a contiguous set of rows via 3 ints. 212 type ContigIndexer struct { 213 startRow, startBuild, count int 214 } 215 216 func (rowMap ContigIndexer) rowForBuild(buildnum int) (int, error) { 217 if buildnum < rowMap.startBuild || buildnum > rowMap.startBuild+rowMap.count-1 { 218 return 0, fmt.Errorf("failed to find row in JSON for buildnumber: %d. Row mapping or buildnumber is invalid", buildnum) 219 } 220 return buildnum - rowMap.startBuild + rowMap.startRow, nil 221 } 222 223 // DictIndexer is a BuildIndexer implementation for when the buildnum to row index mapping is simply a dictionary. 224 // The value type of this dictionary is interface instead of int so that we don't have to convert the original map. 225 type DictIndexer map[string]interface{} 226 227 func (rowMap DictIndexer) rowForBuild(buildnum int) (int, error) { 228 row, ok := rowMap[strconv.Itoa(buildnum)] 229 if !ok { 230 return 0, fmt.Errorf("failed to find row in JSON for buildnumber: %d. Row mapping or buildnumber is invalid", buildnum) 231 } 232 var irow float64 233 if irow, ok = row.(float64); !ok { 234 return 0, fmt.Errorf("failed to find row in JSON for buildnumber: %d. Row mapping contains invalid type", buildnum) 235 } 236 return int(irow), nil 237 } 238 239 // loadClusters parses and filters the json data, then populates every Cluster struct with 240 // aggregated job data and totals. The job data specifies all jobs that failed in a cluster and the 241 // builds that failed for each job, independent of which tests the jobs or builds failed. 242 func (f *TriageFiler) loadClusters(jsonIn []byte) ([]*Cluster, error) { 243 var err error 244 f.data, err = parseTriageData(jsonIn) 245 if err != nil { 246 return nil, err 247 } 248 if err = f.filterAndValidate(f.windowDays); err != nil { 249 return nil, err 250 } 251 252 // Aggregate failing builds in each cluster by job (independent of tests). 253 for _, clust := range f.data.Clustered { 254 clust.filer = f 255 clust.jobs = make(map[string][]int) 256 257 for _, test := range clust.Tests { 258 for _, job := range test.Jobs { 259 for _, buildnum := range job.Builds { 260 found := false 261 for _, oldBuild := range clust.jobs[job.Name] { 262 if oldBuild == buildnum { 263 found = true 264 break 265 } 266 } 267 if !found { 268 clust.jobs[job.Name] = append(clust.jobs[job.Name], buildnum) 269 } 270 } 271 } 272 } 273 clust.totalJobs = len(clust.jobs) 274 clust.totalTests = len(clust.Tests) 275 clust.totalBuilds = 0 276 for _, builds := range clust.jobs { 277 clust.totalBuilds += len(builds) 278 } 279 } 280 return f.data.Clustered, nil 281 } 282 283 // parseTriageData unmarshals raw json data into a triageData struct and creates a BuildIndexer for 284 // every job. 285 func parseTriageData(jsonIn []byte) (*triageData, error) { 286 var data triageData 287 if err := json.Unmarshal(jsonIn, &data); err != nil { 288 return nil, err 289 } 290 291 if data.Builds.Cols.Started == nil { 292 return nil, fmt.Errorf("triage data json is missing the builds.cols.started key") 293 } 294 if data.Builds.JobsRaw == nil { 295 return nil, fmt.Errorf("triage data is missing the builds.jobs key") 296 } 297 if data.Builds.JobPaths == nil { 298 return nil, fmt.Errorf("triage data is missing the builds.job_paths key") 299 } 300 if data.Clustered == nil { 301 return nil, fmt.Errorf("triage data is missing the clustered key") 302 } 303 // Populate 'Jobs' with the BuildIndexer for each job. 304 data.Builds.Jobs = make(map[string]BuildIndexer) 305 for jobID, mapper := range data.Builds.JobsRaw { 306 switch mapper := mapper.(type) { 307 case []interface{}: 308 // In this case mapper is a 3 member array. 0:first buildnum, 1:number of builds, 2:start index. 309 data.Builds.Jobs[jobID] = ContigIndexer{ 310 startBuild: int(mapper[0].(float64)), 311 count: int(mapper[1].(float64)), 312 startRow: int(mapper[2].(float64)), 313 } 314 case map[string]interface{}: 315 // In this case mapper is a dictionary. 316 data.Builds.Jobs[jobID] = DictIndexer(mapper) 317 default: 318 return nil, fmt.Errorf("the build number to row index mapping for job '%s' is not an accepted type. Type is: %v", jobID, reflect.TypeOf(mapper)) 319 } 320 } 321 return &data, nil 322 } 323 324 // topClusters gets the 'count' most important clusters from a slice of clusters based on number of build failures. 325 func topClusters(clusters []*Cluster, count int) []*Cluster { 326 less := func(i, j int) bool { return clusters[i].totalBuilds > clusters[j].totalBuilds } 327 sort.SliceStable(clusters, less) 328 329 if len(clusters) < count { 330 count = len(clusters) 331 } 332 return clusters[0:count] 333 } 334 335 // topTestsFailing returns the top 'count' test names sorted by number of failing jobs. 336 func (c *Cluster) topTestsFailed(count int) []*Test { 337 less := func(i, j int) bool { return len(c.Tests[i].Jobs) > len(c.Tests[j].Jobs) } 338 sort.SliceStable(c.Tests, less) 339 340 if len(c.Tests) < count { 341 count = len(c.Tests) 342 } 343 return c.Tests[0:count] 344 } 345 346 // topJobsFailed returns the top 'count' job names sorted by number of failing builds. 347 func (c *Cluster) topJobsFailed(count int) []*Job { 348 slice := make([]*Job, len(c.jobs)) 349 i := 0 350 for jobName, builds := range c.jobs { 351 slice[i] = &Job{Name: jobName, Builds: builds} 352 i++ 353 } 354 less := func(i, j int) bool { return len(slice[i].Builds) > len(slice[j].Builds) } 355 sort.SliceStable(slice, less) 356 357 if len(slice) < count { 358 count = len(slice) 359 } 360 return slice[0:count] 361 } 362 363 // Title is the string to use as the github issue title. 364 func (c *Cluster) Title() string { 365 return fmt.Sprintf("Failure cluster [%s...] failed %d builds, %d jobs, and %d tests over %d days", 366 c.Identifier[0:6], 367 c.totalBuilds, 368 c.totalJobs, 369 c.totalTests, 370 c.filer.windowDays, 371 ) 372 } 373 374 // Body returns the body text of the github issue and *must* contain the output of ID(). 375 // closedIssues is a (potentially empty) slice containing all closed issues authored by this bot 376 // that contain ID() in their body. 377 // If Body returns an empty string no issue is created. 378 func (c *Cluster) Body(closedIssues []*githubapi.Issue) string { 379 // First check that the most recently closed issue (if any exist) was closed 380 // before the start of the sliding window. 381 cutoffTime := time.Unix(c.filer.latestStart, 0).AddDate(0, 0, -c.filer.windowDays) 382 for _, closed := range closedIssues { 383 if closed.ClosedAt.After(cutoffTime) { 384 return "" 385 } 386 } 387 388 var buf bytes.Buffer 389 fmt.Fprintf(&buf, "### Failure cluster [%s](%s#%s)\n", c.ID(), triageURL, c.Identifier) 390 fmt.Fprintf(&buf, "##### Error text:\n```\n%s\n```\n", c.Text) 391 // cluster stats 392 fmt.Fprint(&buf, "##### Failure cluster statistics:\n") 393 fmt.Fprintf(&buf, "%d tests failed, %d jobs failed, %d builds failed.\n", c.totalTests, c.totalJobs, c.totalBuilds) 394 fmt.Fprintf(&buf, "Failure stats cover %d day time range '%s' to '%s'.\n##### Top failed tests by jobs failed:\n", 395 c.filer.windowDays, 396 cutoffTime.Format(timeFormat), 397 time.Unix(c.filer.latestStart, 0).Format(timeFormat)) 398 // top tests failed 399 fmt.Fprint(&buf, "\n| Test Name | Jobs Failed |\n| --- | --- |\n") 400 for _, test := range c.topTestsFailed(topTestsCount) { 401 fmt.Fprintf(&buf, "| %s | %d |\n", test.Name, len(test.Jobs)) 402 } 403 // top jobs failed 404 fmt.Fprint(&buf, "\n##### Top failed jobs by builds failed:\n") 405 fmt.Fprint(&buf, "\n| Job Name | Builds Failed | Latest Failure |\n| --- | --- | --- |\n") 406 for _, job := range c.topJobsFailed(topJobsCount) { 407 latest := 0 408 latestTime := int64(0) 409 rowMap := c.filer.data.Builds.Jobs[job.Name] 410 for _, build := range job.Builds { 411 row, _ := rowMap.rowForBuild(build) // Already validated start time lookup for all builds. 412 buildTime := c.filer.data.Builds.Cols.Started[row] 413 if buildTime > latestTime { 414 latestTime = buildTime 415 latest = build 416 } 417 } 418 path := strings.TrimPrefix(c.filer.data.Builds.JobPaths[job.Name], "gs://") 419 fmt.Fprintf(&buf, "| %s | %d | [%s](https://gubernator.k8s.io/build/%s/%d) |\n", job.Name, len(job.Builds), time.Unix(latestTime, 0).Format(timeFormat), path, latest) 420 } 421 // previously closed issues if there are any 422 if len(closedIssues) > 0 { 423 fmt.Fprint(&buf, "\n##### Previously closed issues for this cluster:\n") 424 for _, closed := range closedIssues { 425 fmt.Fprintf(&buf, "#%d ", *closed.Number) 426 } 427 fmt.Fprint(&buf, "\n") 428 } 429 430 // Create /assign command. 431 testNames := make([]string, 0, len(c.Tests)) 432 for _, test := range c.topTestsFailed(len(c.Tests)) { 433 testNames = append(testNames, test.Name) 434 } 435 ownersMap := c.filer.creator.TestsOwners(testNames) 436 if len(ownersMap) > 0 { 437 fmt.Fprint(&buf, "\n/assign") 438 for user := range ownersMap { 439 fmt.Fprintf(&buf, " @%s", user) 440 } 441 fmt.Fprint(&buf, "\n") 442 } 443 444 // Explanations of assignees and sigs 445 fmt.Fprint(&buf, c.filer.creator.ExplainTestAssignments(testNames)) 446 447 fmt.Fprintf(&buf, "\n[Current Status](%s#%s)", triageURL, c.Identifier) 448 449 return buf.String() 450 } 451 452 // ID yields the string identifier that uniquely identifies this issue. 453 // This ID must appear in the body of the issue. 454 // DO NOT CHANGE how this ID is formatted or duplicate issues may be created on github. 455 func (c *Cluster) ID() string { 456 return c.Identifier 457 } 458 459 // Labels returns the labels to apply to the issue created for this cluster on github. 460 func (c *Cluster) Labels() []string { 461 labels := []string{"kind/flake"} 462 463 topTests := make([]string, len(c.Tests)) 464 for i, test := range c.topTestsFailed(len(c.Tests)) { 465 topTests[i] = test.Name 466 } 467 for sig := range c.filer.creator.TestsSIGs(topTests) { 468 labels = append(labels, "sig/"+sig) 469 } 470 471 return labels 472 } 473 474 // Owners returns the list of usernames to assign to this issue on github. 475 func (c *Cluster) Owners() []string { 476 // Assign owners by including a /assign command in the body instead of using Owners to set 477 // assignees on the issue request. This lets prow do the assignee validation and will mention 478 // the user we want to assign even if they can't be assigned. 479 return nil 480 } 481 482 // Priority calculates and returns the priority of this issue. 483 // The returned bool indicates if the returned priority is valid and can be used. 484 func (c *Cluster) Priority() (string, bool) { 485 // TODO implement priority calcs later. 486 return "", false 487 }