// Origin: github.com/shashidharatd/test-infra@v0.0.0-20171006011030-71304e1ca560/robots/issue-creator/sources/triage-filer.go

/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package sources

import (
	"bytes"
	"encoding/json"
	"flag"
	"fmt"
	"reflect"
	"sort"
	"strconv"
	"strings"
	"time"

	githubapi "github.com/google/go-github/github"
	"k8s.io/test-infra/mungegithub/mungers/mungerutil"
	"k8s.io/test-infra/robots/issue-creator/creator"
)

const (
	// timeFormat is the layout used for every timestamp written into an issue body.
	timeFormat = "2 Jan 2006 15:04 MST"

	// Configuration constants.
	// topJobsCount and topTestsCount bound the number of rows in the "top failed jobs" and
	// "top failed tests" tables of an issue body.
	topJobsCount  = 3
	topTestsCount = 3
	// triageURL is the page that issue bodies link to; the cluster id is appended as a fragment.
	triageURL = "https://go.k8s.io/triage"
	// clusterDataURL is where the raw failure cluster JSON is downloaded from.
	clusterDataURL = "https://storage.googleapis.com/k8s-gubernator/triage/failure_data.json"
)

// TriageFiler files issues for clustered test failures.
type TriageFiler struct {
	// topClustersCount is the number of clusters to return as issues (flag "triage-count").
	topClustersCount int
	// windowDays is the size, in days, of the sliding window of failures considered
	// (flag "triage-window").
	windowDays int

	// nextSync is not referenced by the code visible in this file.
	nextSync time.Time
	// latestStart is the most recent build start time found in the loaded data. It is handed to
	// time.Unix as seconds and anchors the end of the sliding window.
	latestStart int64

	// creator is the IssueCreator passed to the most recent call of Issues.
	creator *creator.IssueCreator
	// data is the most recently parsed and filtered triage data.
	data *triageData
}

// init registers a TriageFiler with the issue creator under the name "triage-filer".
func init() {
	creator.RegisterSourceOrDie("triage-filer", &TriageFiler{})
}

// Issues is the main work function of the TriageFiler. It fetches and parses cluster data,
// then returns the top clusters as the issues to be synced to github by the IssueCreator.
63 func (f *TriageFiler) Issues(c *creator.IssueCreator) ([]creator.Issue, error) { 64 f.creator = c 65 rawjson, err := mungerutil.ReadHTTP(clusterDataURL) 66 if err != nil { 67 return nil, err 68 } 69 clusters, err := f.loadClusters(rawjson) 70 if err != nil { 71 return nil, err 72 } 73 topclusters := topClusters(clusters, f.topClustersCount) 74 issues := make([]creator.Issue, 0, len(topclusters)) 75 for _, clust := range topclusters { 76 issues = append(issues, clust) 77 } 78 return issues, nil 79 } 80 81 // RegisterOptions registers options for this munger; returns any that require a restart when changed. 82 func (f *TriageFiler) RegisterFlags() { 83 flag.IntVar(&f.topClustersCount, "triage-count", 3, "The number of clusters to sync issues for on github.") 84 flag.IntVar(&f.windowDays, "triage-window", 1, "The size of the sliding time window (in days) that is used to determine which failures to consider.") 85 } 86 87 // triageData is a struct that represents the format of the JSON triage data and is used for parsing. 
type triageData struct {
	// Builds describes every build referenced by the clusters.
	Builds struct {
		// Cols holds one value per build for each column. The position of a build within these
		// slices is the row index returned by the BuildIndexer of the build's job.
		Cols struct {
			Elapsed     []int    `json:"elapsed"`
			Executor    []string `json:"executor"`
			PR          []string `json:"pr"`
			Result      []string `json:"result"`
			Started     []int64  `json:"started"` // build start times; passed to time.Unix as seconds
			TestsFailed []int    `json:"tests_failed"`
			TestsRun    []int    `json:"tests_run"`
		} `json:"cols"`
		// JobsRaw is the build number to row index mapping of each job exactly as it was decoded.
		JobsRaw map[string]interface{} `json:"jobs"` // []int or map[string]int
		// Jobs is built from JobsRaw by parseTriageData, keyed by job name.
		// NOTE(review): this field carries no json tag, so a literal "Jobs" key in the input would
		// be decoded into it and fail - confirm whether `json:"-"` is wanted.
		Jobs map[string]BuildIndexer
		// JobPaths maps a job name to its storage path; a "gs://" prefix is trimmed before the
		// path is used in build links.
		JobPaths map[string]string `json:"job_paths"`
	} `json:"builds"`
	// Clustered is the list of failure clusters.
	Clustered []*Cluster `json:"clustered"`
}

// Cluster is a single failure cluster from the triage data together with the aggregate statistics
// computed by loadClusters. Its methods supply the title, body, ID and labels of a github issue.
type Cluster struct {
	// Id identifies the cluster; it is returned by ID() and linked to in the issue body.
	Id  string `json:"id"`
	Key string `json:"key"`
	// Text is the error text shown in the issue body.
	Text string `json:"text"`
	// Tests lists the tests that failed in this cluster.
	Tests []*Test `json:"tests"`

	// filer is the TriageFiler that loaded this cluster.
	filer *TriageFiler
	// jobs maps a job name to the de-duplicated build numbers that failed in this cluster,
	// independent of which test they failed.
	jobs map[string][]int
	// totalBuilds is the number of builds across all entries of jobs.
	totalBuilds int
	// totalJobs is the number of entries in jobs.
	totalJobs int
	// totalTests is the number of entries in Tests.
	totalTests int
}

// Test is a test that failed in a cluster, along with the jobs it failed in.
type Test struct {
	Name string `json:"name"`
	Jobs []*Job `json:"jobs"`
}

// Job is a job in which a test failed, along with the numbers of the failing builds.
type Job struct {
	Name   string `json:"name"`
	Builds []int  `json:"builds"`
}

// filterAndValidate removes failure data that falls outside the time window and ensures that cluster
// data is well formed. It also removes data for PR jobs so that only post-submit failures are considered.
131 func (f *TriageFiler) filterAndValidate(windowDays int) error { 132 f.latestStart = int64(0) 133 for _, start := range f.data.Builds.Cols.Started { 134 if start > f.latestStart { 135 f.latestStart = start 136 } 137 } 138 cutoffTime := time.Unix(f.latestStart, 0).AddDate(0, 0, -windowDays).Unix() 139 140 validClusts := []*Cluster{} 141 for clustIndex, clust := range f.data.Clustered { 142 if len(clust.Id) == 0 { 143 return fmt.Errorf("the cluster at index %d in the triage JSON data does not specify an Id.", clustIndex) 144 } 145 if clust.Tests == nil { 146 return fmt.Errorf("cluster '%s' does not have a 'tests' key.", clust.Id) 147 } 148 validTests := []*Test{} 149 for _, test := range clust.Tests { 150 if len(test.Name) == 0 { 151 return fmt.Errorf("cluster '%s' contains a test without a name.", clust.Id) 152 } 153 if test.Jobs == nil { 154 return fmt.Errorf("cluster '%s' does not have a 'jobs' key.", clust.Id) 155 } 156 validJobs := []*Job{} 157 for _, job := range test.Jobs { 158 if len(job.Name) == 0 { 159 return fmt.Errorf("cluster '%s' contains a job without a name under test '%s'.", clust.Id, test.Name) 160 } 161 // Filter out PR jobs 162 if strings.HasPrefix(job.Name, "pr:") { 163 continue 164 } 165 if len(job.Builds) == 0 { 166 return fmt.Errorf("cluster '%s' contains job '%s' under test '%s' with no failing builds.", clust.Id, job.Name, test.Name) 167 } 168 validBuilds := []int{} 169 rowMap, ok := f.data.Builds.Jobs[job.Name] 170 if !ok { 171 return fmt.Errorf("triage json data does not contain buildnum to row index mapping for job '%s'.", job.Name) 172 } 173 for _, buildnum := range job.Builds { 174 row, err := rowMap.rowForBuild(buildnum) 175 if err != nil { 176 return err 177 } 178 if f.data.Builds.Cols.Started[row] > cutoffTime { 179 validBuilds = append(validBuilds, buildnum) 180 } 181 } 182 if len(validBuilds) > 0 { 183 job.Builds = validBuilds 184 validJobs = append(validJobs, job) 185 } 186 } 187 if len(validJobs) > 0 { 188 test.Jobs = 
validJobs 189 validTests = append(validTests, test) 190 } 191 } 192 if len(validTests) > 0 { 193 clust.Tests = validTests 194 validClusts = append(validClusts, clust) 195 } 196 } 197 f.data.Clustered = validClusts 198 return nil 199 } 200 201 // BuildIndexer is an interface that describes the buildnum to row index mapping used to retrieve data 202 // about individual builds from the JSON file. 203 // This is an interface because the JSON format describing failure clusters has 2 ways of recording the mapping info. 204 type BuildIndexer interface { 205 rowForBuild(buildnum int) (int, error) 206 } 207 208 // ContigIndexer is a BuildIndexer implementation for when the buildnum to row index mapping describes 209 // a contiguous set of rows via 3 ints. 210 type ContigIndexer struct { 211 startRow, startBuild, count int 212 } 213 214 func (rowMap ContigIndexer) rowForBuild(buildnum int) (int, error) { 215 if buildnum < rowMap.startBuild || buildnum > rowMap.startBuild+rowMap.count-1 { 216 return 0, fmt.Errorf("failed to find row in JSON for buildnumber: %d. Row mapping or buildnumber is invalid.", buildnum) 217 } 218 return buildnum - rowMap.startBuild + rowMap.startRow, nil 219 } 220 221 // DictIndexer is a BuildIndexer implementation for when the buildnum to row index mapping is simply a dictionary. 222 // The value type of this dictionary is interface instead of int so that we don't have to convert the original map. 223 type DictIndexer map[string]interface{} 224 225 func (rowMap DictIndexer) rowForBuild(buildnum int) (int, error) { 226 row, ok := rowMap[strconv.Itoa(buildnum)] 227 if !ok { 228 return 0, fmt.Errorf("failed to find row in JSON for buildnumber: %d. Row mapping or buildnumber is invalid.", buildnum) 229 } 230 var irow float64 231 if irow, ok = row.(float64); !ok { 232 return 0, fmt.Errorf("failed to find row in JSON for buildnumber: %d. 
Row mapping contains invalid type.", buildnum) 233 } 234 return int(irow), nil 235 } 236 237 // loadClusters parses and filters the json data, then populates every Cluster struct with 238 // aggregated job data and totals. The job data specifies all jobs that failed in a cluster and the 239 // builds that failed for each job, independent of which tests the jobs or builds failed. 240 func (f *TriageFiler) loadClusters(jsonIn []byte) ([]*Cluster, error) { 241 var err error 242 f.data, err = parseTriageData(jsonIn) 243 if err != nil { 244 return nil, err 245 } 246 if err = f.filterAndValidate(f.windowDays); err != nil { 247 return nil, err 248 } 249 250 // Aggregate failing builds in each cluster by job (independent of tests). 251 for _, clust := range f.data.Clustered { 252 clust.filer = f 253 clust.jobs = make(map[string][]int) 254 255 for _, test := range clust.Tests { 256 for _, job := range test.Jobs { 257 for _, buildnum := range job.Builds { 258 found := false 259 for _, oldBuild := range clust.jobs[job.Name] { 260 if oldBuild == buildnum { 261 found = true 262 break 263 } 264 } 265 if !found { 266 clust.jobs[job.Name] = append(clust.jobs[job.Name], buildnum) 267 } 268 } 269 } 270 } 271 clust.totalJobs = len(clust.jobs) 272 clust.totalTests = len(clust.Tests) 273 clust.totalBuilds = 0 274 for _, builds := range clust.jobs { 275 clust.totalBuilds += len(builds) 276 } 277 } 278 return f.data.Clustered, nil 279 } 280 281 // parseTriageData unmarshals raw json data into a triageData struct and creates a BuildIndexer for 282 // every job. 
283 func parseTriageData(jsonIn []byte) (*triageData, error) { 284 var data triageData 285 if err := json.Unmarshal(jsonIn, &data); err != nil { 286 return nil, err 287 } 288 289 if data.Builds.Cols.Started == nil { 290 return nil, fmt.Errorf("triage data json is missing the builds.cols.started key.") 291 } 292 if data.Builds.JobsRaw == nil { 293 return nil, fmt.Errorf("triage data is missing the builds.jobs key.") 294 } 295 if data.Builds.JobPaths == nil { 296 return nil, fmt.Errorf("triage data is missing the builds.job_paths key.") 297 } 298 if data.Clustered == nil { 299 return nil, fmt.Errorf("triage data is missing the clustered key.") 300 } 301 // Populate 'Jobs' with the BuildIndexer for each job. 302 data.Builds.Jobs = make(map[string]BuildIndexer) 303 for jobID, mapper := range data.Builds.JobsRaw { 304 switch mapper := mapper.(type) { 305 case []interface{}: 306 // In this case mapper is a 3 member array. 0:first buildnum, 1:number of builds, 2:start index. 307 data.Builds.Jobs[jobID] = ContigIndexer{ 308 startBuild: int(mapper[0].(float64)), 309 count: int(mapper[1].(float64)), 310 startRow: int(mapper[2].(float64)), 311 } 312 case map[string]interface{}: 313 // In this case mapper is a dictionary. 314 data.Builds.Jobs[jobID] = DictIndexer(mapper) 315 default: 316 return nil, fmt.Errorf("the build number to row index mapping for job '%s' is not an accepted type. Type is: %v", jobID, reflect.TypeOf(mapper)) 317 } 318 } 319 return &data, nil 320 } 321 322 // topClusters gets the 'count' most important clusters from a slice of clusters based on number of build failures. 323 func topClusters(clusters []*Cluster, count int) []*Cluster { 324 less := func(i, j int) bool { return clusters[i].totalBuilds > clusters[j].totalBuilds } 325 sort.SliceStable(clusters, less) 326 327 if len(clusters) < count { 328 count = len(clusters) 329 } 330 return clusters[0:count] 331 } 332 333 // topTestsFailing returns the top 'count' test names sorted by number of failing jobs. 
334 func (c *Cluster) topTestsFailed(count int) []*Test { 335 less := func(i, j int) bool { return len(c.Tests[i].Jobs) > len(c.Tests[j].Jobs) } 336 sort.SliceStable(c.Tests, less) 337 338 if len(c.Tests) < count { 339 count = len(c.Tests) 340 } 341 return c.Tests[0:count] 342 } 343 344 // topJobsFailed returns the top 'count' job names sorted by number of failing builds. 345 func (c *Cluster) topJobsFailed(count int) []*Job { 346 slice := make([]*Job, len(c.jobs)) 347 i := 0 348 for jobName, builds := range c.jobs { 349 slice[i] = &Job{Name: jobName, Builds: builds} 350 i++ 351 } 352 less := func(i, j int) bool { return len(slice[i].Builds) > len(slice[j].Builds) } 353 sort.SliceStable(slice, less) 354 355 if len(slice) < count { 356 count = len(slice) 357 } 358 return slice[0:count] 359 } 360 361 // Title is the string to use as the github issue title. 362 func (c *Cluster) Title() string { 363 return fmt.Sprintf("Failure cluster [%s...] failed %d builds, %d jobs, and %d tests over %d days", 364 c.Id[0:6], 365 c.totalBuilds, 366 c.totalJobs, 367 c.totalTests, 368 c.filer.windowDays, 369 ) 370 } 371 372 // Body returns the body text of the github issue and *must* contain the output of ID(). 373 // closedIssues is a (potentially empty) slice containing all closed issues authored by this bot 374 // that contain ID() in their body. 375 // If Body returns an empty string no issue is created. 376 func (c *Cluster) Body(closedIssues []*githubapi.Issue) string { 377 // First check that the most recently closed issue (if any exist) was closed 378 // before the start of the sliding window. 
379 cutoffTime := time.Unix(c.filer.latestStart, 0).AddDate(0, 0, -c.filer.windowDays) 380 for _, closed := range closedIssues { 381 if closed.ClosedAt.After(cutoffTime) { 382 return "" 383 } 384 } 385 386 var buf bytes.Buffer 387 fmt.Fprintf(&buf, "### Failure cluster [%s](%s#%s)\n", c.ID(), triageURL, c.Id) 388 fmt.Fprintf(&buf, "##### Error text:\n```\n%s\n```\n", c.Text) 389 // cluster stats 390 fmt.Fprint(&buf, "##### Failure cluster statistics:\n") 391 fmt.Fprintf(&buf, "%d tests failed, %d jobs failed, %d builds failed.\n", c.totalTests, c.totalJobs, c.totalBuilds) 392 fmt.Fprintf(&buf, "Failure stats cover %d day time range '%s' to '%s'.\n##### Top failed tests by jobs failed:\n", 393 c.filer.windowDays, 394 cutoffTime.Format(timeFormat), 395 time.Unix(c.filer.latestStart, 0).Format(timeFormat)) 396 // top tests failed 397 fmt.Fprint(&buf, "\n| Test Name | Jobs Failed |\n| --- | --- |\n") 398 for _, test := range c.topTestsFailed(topTestsCount) { 399 fmt.Fprintf(&buf, "| %s | %d |\n", test.Name, len(test.Jobs)) 400 } 401 // top jobs failed 402 fmt.Fprint(&buf, "\n##### Top failed jobs by builds failed:\n") 403 fmt.Fprint(&buf, "\n| Job Name | Builds Failed | Latest Failure |\n| --- | --- | --- |\n") 404 for _, job := range c.topJobsFailed(topJobsCount) { 405 latest := 0 406 latestTime := int64(0) 407 rowMap := c.filer.data.Builds.Jobs[job.Name] 408 for _, build := range job.Builds { 409 row, _ := rowMap.rowForBuild(build) // Already validated start time lookup for all builds. 
410 buildTime := c.filer.data.Builds.Cols.Started[row] 411 if buildTime > latestTime { 412 latestTime = buildTime 413 latest = build 414 } 415 } 416 path := strings.TrimPrefix(c.filer.data.Builds.JobPaths[job.Name], "gs://") 417 fmt.Fprintf(&buf, "| %s | %d | [%s](https://k8s-gubernator.appspot.com/build/%s/%d) |\n", job.Name, len(job.Builds), time.Unix(latestTime, 0).Format(timeFormat), path, latest) 418 } 419 // previously closed issues if there are any 420 if len(closedIssues) > 0 { 421 fmt.Fprint(&buf, "\n##### Previously closed issues for this cluster:\n") 422 for _, closed := range closedIssues { 423 fmt.Fprintf(&buf, "#%d ", *closed.Number) 424 } 425 fmt.Fprint(&buf, "\n") 426 } 427 428 // Create /assign command. 429 testNames := make([]string, 0, len(c.Tests)) 430 for _, test := range c.topTestsFailed(len(c.Tests)) { 431 testNames = append(testNames, test.Name) 432 } 433 ownersMap := c.filer.creator.TestsOwners(testNames) 434 if len(ownersMap) > 0 { 435 fmt.Fprint(&buf, "\n/assign") 436 for user := range ownersMap { 437 fmt.Fprintf(&buf, " @%s", user) 438 } 439 fmt.Fprint(&buf, "\n") 440 } 441 442 // Explanations of assignees and sigs 443 fmt.Fprint(&buf, c.filer.creator.ExplainTestAssignments(testNames)) 444 445 fmt.Fprintf(&buf, "\n[Current Status](%s#%s)", triageURL, c.Id) 446 447 return buf.String() 448 } 449 450 // ID yields the string identifier that uniquely identifies this issue. 451 // This ID must appear in the body of the issue. 452 // DO NOT CHANGE how this ID is formatted or duplicate issues may be created on github. 453 func (c *Cluster) ID() string { 454 return c.Id 455 } 456 457 // Labels returns the labels to apply to the issue created for this cluster on github. 
458 func (c *Cluster) Labels() []string { 459 labels := []string{"kind/flake"} 460 461 topTests := make([]string, len(c.Tests)) 462 for i, test := range c.topTestsFailed(len(c.Tests)) { 463 topTests[i] = test.Name 464 } 465 for sig := range c.filer.creator.TestsSIGs(topTests) { 466 labels = append(labels, "sig/"+sig) 467 } 468 469 return labels 470 } 471 472 // Owners returns the list of usernames to assign to this issue on github. 473 func (c *Cluster) Owners() []string { 474 // Assign owners by including a /assign command in the body instead of using Owners to set 475 // assignees on the issue request. This lets prow do the assignee validation and will mention 476 // the user we want to assign even if they can't be assigned. 477 return nil 478 } 479 480 // Priority calculates and returns the priority of this issue. 481 // The returned bool indicates if the returned priority is valid and can be used. 482 func (c *Cluster) Priority() (string, bool) { 483 // TODO implement priority calcs later. 484 return "", false 485 }