github.com/abayer/test-infra@v0.0.5/robots/issue-creator/sources/triage-filer.go

/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package sources

import (
    "bytes"
    "encoding/json"
    "flag"
    "fmt"
    "reflect"
    "sort"
    "strconv"
    "strings"
    "time"

    githubapi "github.com/google/go-github/github"
    "k8s.io/test-infra/mungegithub/mungers/mungerutil"
    "k8s.io/test-infra/robots/issue-creator/creator"
)

const (
    timeFormat = "2 Jan 2006 15:04 MST"

    // Configuration constants.
    topJobsCount   = 3
    topTestsCount  = 3
    triageURL      = "https://go.k8s.io/triage"
    clusterDataURL = "https://storage.googleapis.com/k8s-gubernator/triage/failure_data.json"
)

// TriageFiler files issues for clustered test failures.
type TriageFiler struct {
    topClustersCount int
    windowDays       int

    nextSync    time.Time
    latestStart int64

    creator *creator.IssueCreator
    data    *triageData
}

func init() {
    creator.RegisterSourceOrDie("triage-filer", &TriageFiler{})
}

// Issues is the main work function of the TriageFiler. It fetches and parses cluster data,
// then syncs the top issues to github with the IssueCreator.
func (f *TriageFiler) Issues(c *creator.IssueCreator) ([]creator.Issue, error) {
    f.creator = c
    rawjson, err := mungerutil.ReadHTTP(clusterDataURL)
    if err != nil {
        return nil, err
    }
    clusters, err := f.loadClusters(rawjson)
    if err != nil {
        return nil, err
    }
    topclusters := topClusters(clusters, f.topClustersCount)
    issues := make([]creator.Issue, 0, len(topclusters))
    for _, clust := range topclusters {
        issues = append(issues, clust)
    }
    return issues, nil
}

// RegisterFlags registers the command line options used by the TriageFiler.
func (f *TriageFiler) RegisterFlags() {
    flag.IntVar(&f.topClustersCount, "triage-count", 3, "The number of clusters to sync issues for on github.")
    flag.IntVar(&f.windowDays, "triage-window", 1, "The size of the sliding time window (in days) that is used to determine which failures to consider.")
}

// triageData is a struct that represents the format of the JSON triage data and is used for parsing.
type triageData struct {
    Builds struct {
        Cols struct {
            Elapsed     []int    `json:"elapsed"`
            Executor    []string `json:"executor"`
            PR          []string `json:"pr"`
            Result      []string `json:"result"`
            Started     []int64  `json:"started"`
            TestsFailed []int    `json:"tests_failed"`
            TestsRun    []int    `json:"tests_run"`
        } `json:"cols"`
        JobsRaw  map[string]interface{} `json:"jobs"` // []int or map[string]int
        Jobs     map[string]BuildIndexer
        JobPaths map[string]string `json:"job_paths"`
    } `json:"builds"`
    Clustered []*Cluster `json:"clustered"`
}
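// Illustrative sketch of the two shapes a builds.jobs entry can take in the triage
// JSON (job names and numbers invented here, not taken from real data); this is why
// JobsRaw is decoded as map[string]interface{} and later converted to a BuildIndexer:
//
//    "jobs": {
//        "ci-some-job":  [1000, 5, 40],           // contiguous: [first buildnum, number of builds, start row]
//        "ci-other-job": {"1000": 7, "1002": 9}   // dictionary: buildnum (as a string) -> row index
//    }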
// Cluster holds information about a failure cluster.
type Cluster struct {
    Identifier string  `json:"id"`
    Key        string  `json:"key"`
    Text       string  `json:"text"`
    Tests      []*Test `json:"tests"`

    filer       *TriageFiler
    jobs        map[string][]int
    totalBuilds int
    totalJobs   int
    totalTests  int
}

// Test holds a name and list of jobs
type Test struct {
    Name string `json:"name"`
    Jobs []*Job `json:"jobs"`
}

// Job holds a name and list of build numbers
type Job struct {
    Name   string `json:"name"`
    Builds []int  `json:"builds"`
}

// filterAndValidate removes failure data that falls outside the time window and ensures that cluster
// data is well formed. It also removes data for PR jobs so that only post-submit failures are considered.
func (f *TriageFiler) filterAndValidate(windowDays int) error {
    f.latestStart = int64(0)
    for _, start := range f.data.Builds.Cols.Started {
        if start > f.latestStart {
            f.latestStart = start
        }
    }
    cutoffTime := time.Unix(f.latestStart, 0).AddDate(0, 0, -windowDays).Unix()

    validClusts := []*Cluster{}
    for clustIndex, clust := range f.data.Clustered {
        if len(clust.Identifier) == 0 {
            return fmt.Errorf("the cluster at index %d in the triage JSON data does not specify an ID", clustIndex)
        }
        if clust.Tests == nil {
            return fmt.Errorf("cluster '%s' does not have a 'tests' key", clust.Identifier)
        }
        validTests := []*Test{}
        for _, test := range clust.Tests {
            if len(test.Name) == 0 {
                return fmt.Errorf("cluster '%s' contains a test without a name", clust.Identifier)
            }
            if test.Jobs == nil {
                return fmt.Errorf("cluster '%s' does not have a 'jobs' key", clust.Identifier)
            }
            validJobs := []*Job{}
            for _, job := range test.Jobs {
                if len(job.Name) == 0 {
                    return fmt.Errorf("cluster '%s' contains a job without a name under test '%s'", clust.Identifier, test.Name)
                }
                // Filter out PR jobs
                if strings.HasPrefix(job.Name, "pr:") {
                    continue
                }
                if len(job.Builds) == 0 {
                    return fmt.Errorf("cluster '%s' contains job '%s' under test '%s' with no failing builds", clust.Identifier, job.Name, test.Name)
                }
                validBuilds := []int{}
                rowMap, ok := f.data.Builds.Jobs[job.Name]
                if !ok {
                    return fmt.Errorf("triage json data does not contain buildnum to row index mapping for job '%s'", job.Name)
                }
                for _, buildnum := range job.Builds {
                    row, err := rowMap.rowForBuild(buildnum)
                    if err != nil {
                        return err
                    }
                    if f.data.Builds.Cols.Started[row] > cutoffTime {
                        validBuilds = append(validBuilds, buildnum)
                    }
                }
                if len(validBuilds) > 0 {
                    job.Builds = validBuilds
                    validJobs = append(validJobs, job)
                }
            }
            if len(validJobs) > 0 {
                test.Jobs = validJobs
                validTests = append(validTests, test)
            }
        }
        if len(validTests) > 0 {
            clust.Tests = validTests
            validClusts = append(validClusts, clust)
        }
    }
    f.data.Clustered = validClusts
    return nil
}
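// For example (illustrative values, assuming windowDays = 1): if the most recent
// build in the data started at 12:00 UTC on 2 Jun, cutoffTime is 12:00 UTC on 1 Jun,
// so a failing build is kept only if it started strictly after that instant; jobs,
// tests, and clusters left with no remaining builds are dropped entirely.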
// BuildIndexer is an interface that describes the buildnum to row index mapping used to retrieve data
// about individual builds from the JSON file.
// This is an interface because the JSON format describing failure clusters has 2 ways of recording the mapping info.
type BuildIndexer interface {
    rowForBuild(buildnum int) (int, error)
}

// ContigIndexer is a BuildIndexer implementation for when the buildnum to row index mapping describes
// a contiguous set of rows via 3 ints.
type ContigIndexer struct {
    startRow, startBuild, count int
}

func (rowMap ContigIndexer) rowForBuild(buildnum int) (int, error) {
    if buildnum < rowMap.startBuild || buildnum > rowMap.startBuild+rowMap.count-1 {
        return 0, fmt.Errorf("failed to find row in JSON for buildnumber: %d. Row mapping or buildnumber is invalid", buildnum)
    }
    return buildnum - rowMap.startBuild + rowMap.startRow, nil
}

// DictIndexer is a BuildIndexer implementation for when the buildnum to row index mapping is simply a dictionary.
// The value type of this dictionary is interface instead of int so that we don't have to convert the original map.
type DictIndexer map[string]interface{}

func (rowMap DictIndexer) rowForBuild(buildnum int) (int, error) {
    row, ok := rowMap[strconv.Itoa(buildnum)]
    if !ok {
        return 0, fmt.Errorf("failed to find row in JSON for buildnumber: %d. Row mapping or buildnumber is invalid", buildnum)
    }
    var irow float64
    if irow, ok = row.(float64); !ok {
        return 0, fmt.Errorf("failed to find row in JSON for buildnumber: %d. Row mapping contains invalid type", buildnum)
    }
    return int(irow), nil
}
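// Worked example (values invented) of how the two indexers map a build number to a
// row in builds.cols: ContigIndexer{startRow: 40, startBuild: 1000, count: 5} covers
// builds 1000 through 1004, so rowForBuild(1002) returns 1002-1000+40 = 42, while
// DictIndexer{"1002": float64(42)}.rowForBuild(1002) looks the row up directly and
// also returns 42. Any build number outside the mapping yields an error.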
// loadClusters parses and filters the json data, then populates every Cluster struct with
// aggregated job data and totals. The job data specifies all jobs that failed in a cluster and the
// builds that failed for each job, independent of which tests the jobs or builds failed under.
func (f *TriageFiler) loadClusters(jsonIn []byte) ([]*Cluster, error) {
    var err error
    f.data, err = parseTriageData(jsonIn)
    if err != nil {
        return nil, err
    }
    if err = f.filterAndValidate(f.windowDays); err != nil {
        return nil, err
    }

    // Aggregate failing builds in each cluster by job (independent of tests).
    for _, clust := range f.data.Clustered {
        clust.filer = f
        clust.jobs = make(map[string][]int)

        for _, test := range clust.Tests {
            for _, job := range test.Jobs {
                for _, buildnum := range job.Builds {
                    found := false
                    for _, oldBuild := range clust.jobs[job.Name] {
                        if oldBuild == buildnum {
                            found = true
                            break
                        }
                    }
                    if !found {
                        clust.jobs[job.Name] = append(clust.jobs[job.Name], buildnum)
                    }
                }
            }
        }
        clust.totalJobs = len(clust.jobs)
        clust.totalTests = len(clust.Tests)
        clust.totalBuilds = 0
        for _, builds := range clust.jobs {
            clust.totalBuilds += len(builds)
        }
    }
    return f.data.Clustered, nil
}
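// For example (job name and build numbers invented): if one test in a cluster failed
// builds 100 and 101 of job "ci-foo" and another test failed builds 101 and 102 of the
// same job, clust.jobs["ci-foo"] ends up as [100 101 102], giving totalBuilds = 3,
// totalJobs = 1, and totalTests = 2 for that cluster.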
// parseTriageData unmarshals raw json data into a triageData struct and creates a BuildIndexer for
// every job.
func parseTriageData(jsonIn []byte) (*triageData, error) {
    var data triageData
    if err := json.Unmarshal(jsonIn, &data); err != nil {
        return nil, err
    }

    if data.Builds.Cols.Started == nil {
        return nil, fmt.Errorf("triage data json is missing the builds.cols.started key")
    }
    if data.Builds.JobsRaw == nil {
        return nil, fmt.Errorf("triage data is missing the builds.jobs key")
    }
    if data.Builds.JobPaths == nil {
        return nil, fmt.Errorf("triage data is missing the builds.job_paths key")
    }
    if data.Clustered == nil {
        return nil, fmt.Errorf("triage data is missing the clustered key")
    }
    // Populate 'Jobs' with the BuildIndexer for each job.
    data.Builds.Jobs = make(map[string]BuildIndexer)
    for jobID, mapper := range data.Builds.JobsRaw {
        switch mapper := mapper.(type) {
        case []interface{}:
            // In this case mapper is a 3 member array. 0:first buildnum, 1:number of builds, 2:start index.
            data.Builds.Jobs[jobID] = ContigIndexer{
                startBuild: int(mapper[0].(float64)),
                count:      int(mapper[1].(float64)),
                startRow:   int(mapper[2].(float64)),
            }
        case map[string]interface{}:
            // In this case mapper is a dictionary.
            data.Builds.Jobs[jobID] = DictIndexer(mapper)
        default:
            return nil, fmt.Errorf("the build number to row index mapping for job '%s' is not an accepted type. Type is: %v", jobID, reflect.TypeOf(mapper))
        }
    }
    return &data, nil
}

// topClusters gets the 'count' most important clusters from a slice of clusters based on number of build failures.
func topClusters(clusters []*Cluster, count int) []*Cluster {
    less := func(i, j int) bool { return clusters[i].totalBuilds > clusters[j].totalBuilds }
    sort.SliceStable(clusters, less)

    if len(clusters) < count {
        count = len(clusters)
    }
    return clusters[0:count]
}

// topTestsFailed returns the top 'count' tests sorted by number of failing jobs.
func (c *Cluster) topTestsFailed(count int) []*Test {
    less := func(i, j int) bool { return len(c.Tests[i].Jobs) > len(c.Tests[j].Jobs) }
    sort.SliceStable(c.Tests, less)

    if len(c.Tests) < count {
        count = len(c.Tests)
    }
    return c.Tests[0:count]
}

// topJobsFailed returns the top 'count' jobs sorted by number of failing builds.
func (c *Cluster) topJobsFailed(count int) []*Job {
    slice := make([]*Job, len(c.jobs))
    i := 0
    for jobName, builds := range c.jobs {
        slice[i] = &Job{Name: jobName, Builds: builds}
        i++
    }
    less := func(i, j int) bool { return len(slice[i].Builds) > len(slice[j].Builds) }
    sort.SliceStable(slice, less)

    if len(slice) < count {
        count = len(slice)
    }
    return slice[0:count]
}

// Title is the string to use as the github issue title.
func (c *Cluster) Title() string {
    return fmt.Sprintf("Failure cluster [%s...] failed %d builds, %d jobs, and %d tests over %d days",
        c.Identifier[0:6],
        c.totalBuilds,
        c.totalJobs,
        c.totalTests,
        c.filer.windowDays,
    )
}

// Body returns the body text of the github issue and *must* contain the output of ID().
// closedIssues is a (potentially empty) slice containing all closed issues authored by this bot
// that contain ID() in their body.
// If Body returns an empty string no issue is created.
func (c *Cluster) Body(closedIssues []*githubapi.Issue) string {
    // First check that the most recently closed issue (if any exist) was closed
    // before the start of the sliding window.
    cutoffTime := time.Unix(c.filer.latestStart, 0).AddDate(0, 0, -c.filer.windowDays)
    for _, closed := range closedIssues {
        if closed.ClosedAt.After(cutoffTime) {
            return ""
        }
    }

    var buf bytes.Buffer
    fmt.Fprintf(&buf, "### Failure cluster [%s](%s#%s)\n", c.ID(), triageURL, c.Identifier)
    fmt.Fprintf(&buf, "##### Error text:\n```\n%s\n```\n", c.Text)
    // cluster stats
    fmt.Fprint(&buf, "##### Failure cluster statistics:\n")
    fmt.Fprintf(&buf, "%d tests failed, %d jobs failed, %d builds failed.\n", c.totalTests, c.totalJobs, c.totalBuilds)
    fmt.Fprintf(&buf, "Failure stats cover %d day time range '%s' to '%s'.\n##### Top failed tests by jobs failed:\n",
        c.filer.windowDays,
        cutoffTime.Format(timeFormat),
        time.Unix(c.filer.latestStart, 0).Format(timeFormat))
    // top tests failed
    fmt.Fprint(&buf, "\n| Test Name | Jobs Failed |\n| --- | --- |\n")
    for _, test := range c.topTestsFailed(topTestsCount) {
        fmt.Fprintf(&buf, "| %s | %d |\n", test.Name, len(test.Jobs))
    }
    // top jobs failed
    fmt.Fprint(&buf, "\n##### Top failed jobs by builds failed:\n")
    fmt.Fprint(&buf, "\n| Job Name | Builds Failed | Latest Failure |\n| --- | --- | --- |\n")
    for _, job := range c.topJobsFailed(topJobsCount) {
        latest := 0
        latestTime := int64(0)
        rowMap := c.filer.data.Builds.Jobs[job.Name]
        for _, build := range job.Builds {
            row, _ := rowMap.rowForBuild(build) // Already validated start time lookup for all builds.
            buildTime := c.filer.data.Builds.Cols.Started[row]
            if buildTime > latestTime {
                latestTime = buildTime
                latest = build
            }
        }
        path := strings.TrimPrefix(c.filer.data.Builds.JobPaths[job.Name], "gs://")
        fmt.Fprintf(&buf, "| %s | %d | [%s](https://k8s-gubernator.appspot.com/build/%s/%d) |\n", job.Name, len(job.Builds), time.Unix(latestTime, 0).Format(timeFormat), path, latest)
    }
    // previously closed issues if there are any
    if len(closedIssues) > 0 {
        fmt.Fprint(&buf, "\n##### Previously closed issues for this cluster:\n")
        for _, closed := range closedIssues {
            fmt.Fprintf(&buf, "#%d ", *closed.Number)
        }
        fmt.Fprint(&buf, "\n")
    }

    // Create /assign command.
    testNames := make([]string, 0, len(c.Tests))
    for _, test := range c.topTestsFailed(len(c.Tests)) {
        testNames = append(testNames, test.Name)
    }
    ownersMap := c.filer.creator.TestsOwners(testNames)
    if len(ownersMap) > 0 {
        fmt.Fprint(&buf, "\n/assign")
        for user := range ownersMap {
            fmt.Fprintf(&buf, " @%s", user)
        }
        fmt.Fprint(&buf, "\n")
    }

    // Explanations of assignees and sigs
    fmt.Fprint(&buf, c.filer.creator.ExplainTestAssignments(testNames))

    fmt.Fprintf(&buf, "\n[Current Status](%s#%s)", triageURL, c.Identifier)

    return buf.String()
}
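// Abridged sketch of the markdown Body emits (cluster id, error text, names, counts,
// and times are invented; the format strings in Body above are authoritative):
//
//    ### Failure cluster [abcd1234](https://go.k8s.io/triage#abcd1234)
//    ##### Error text:
//    ```
//    timed out waiting for the condition
//    ```
//    ##### Failure cluster statistics:
//    6 tests failed, 4 jobs failed, 18 builds failed.
//    Failure stats cover 1 day time range '1 Jun 2017 12:00 UTC' to '2 Jun 2017 12:00 UTC'.
//    ##### Top failed tests by jobs failed:
//
//    | Test Name | Jobs Failed |
//    | --- | --- |
//    ...
//    /assign @someuser
//
//    [Current Status](https://go.k8s.io/triage#abcd1234)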
// ID yields the string identifier that uniquely identifies this issue.
// This ID must appear in the body of the issue.
// DO NOT CHANGE how this ID is formatted or duplicate issues may be created on github.
func (c *Cluster) ID() string {
    return c.Identifier
}

// Labels returns the labels to apply to the issue created for this cluster on github.
func (c *Cluster) Labels() []string {
    labels := []string{"kind/flake"}

    topTests := make([]string, len(c.Tests))
    for i, test := range c.topTestsFailed(len(c.Tests)) {
        topTests[i] = test.Name
    }
    for sig := range c.filer.creator.TestsSIGs(topTests) {
        labels = append(labels, "sig/"+sig)
    }

    return labels
}

// Owners returns the list of usernames to assign to this issue on github.
func (c *Cluster) Owners() []string {
    // Assign owners by including a /assign command in the body instead of using Owners to set
    // assignees on the issue request. This lets prow do the assignee validation and will mention
    // the user we want to assign even if they can't be assigned.
    return nil
}

// Priority calculates and returns the priority of this issue.
// The returned bool indicates if the returned priority is valid and can be used.
func (c *Cluster) Priority() (string, bool) {
    // TODO implement priority calcs later.
    return "", false
}
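// Illustrative invocation of the robot that registers this source (the binary name is
// an assumption, and any flags owned by the creator package are not shown; only
// --triage-count and --triage-window are registered in this file):
//
//    issue-creator --triage-count=3 --triage-window=1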