/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package sources

import (
	"bytes"
	"encoding/json"
	"flag"
	"fmt"
	"reflect"
	"sort"
	"strconv"
	"strings"
	"time"

	githubapi "github.com/google/go-github/github"
	"k8s.io/test-infra/robots/issue-creator/creator"
)

const (
	// timeFormat is the time layout used when printing timestamps in issue bodies.
	timeFormat = "2 Jan 2006 15:04 MST"

	// Configuration constants.

	// topJobsCount is the number of rows in the "top failed jobs" table of an issue body.
	topJobsCount = 3
	// topTestsCount is the number of rows in the "top failed tests" table of an issue body.
	topTestsCount = 3
	// triageURL is the base URL that issue bodies link to; the cluster ID is appended as a fragment.
	triageURL = "https://go.k8s.io/triage"
	// clusterDataURL is the location the failure cluster JSON is downloaded from.
	clusterDataURL = "https://storage.googleapis.com/k8s-gubernator/triage/failure_data.json"
)

// TriageFiler files issues for clustered test failures.
type TriageFiler struct {
	// topClustersCount is the number of clusters to return issues for (flag "triage-count").
	topClustersCount int
	// windowDays is the size in days of the sliding time window (flag "triage-window").
	windowDays int

	// latestStart is the largest build start time found in the loaded data.
	// It is passed to time.Unix as seconds and marks the end of the time window.
	latestStart int64

	// creator is the IssueCreator handed to Issues; it is used to look up test owners and SIGs.
	creator *creator.IssueCreator
	// data is the parsed and filtered triage data from the most recent load.
	data *triageData
}

// init registers a TriageFiler with the creator package under the name "triage-filer".
func init() {
	creator.RegisterSourceOrDie("triage-filer", &TriageFiler{})
}

// Issues is the main work function of the TriageFiler. It fetches and parses cluster data,
// then syncs the top issues to github with the IssueCreator.
61 func (f *TriageFiler) Issues(c *creator.IssueCreator) ([]creator.Issue, error) { 62 f.creator = c 63 rawjson, err := ReadHTTP(clusterDataURL) 64 if err != nil { 65 return nil, err 66 } 67 clusters, err := f.loadClusters(rawjson) 68 if err != nil { 69 return nil, err 70 } 71 topclusters := topClusters(clusters, f.topClustersCount) 72 issues := make([]creator.Issue, 0, len(topclusters)) 73 for _, clust := range topclusters { 74 issues = append(issues, clust) 75 } 76 return issues, nil 77 } 78 79 // RegisterFlags registers options for this munger; returns any that require a restart when changed. 80 func (f *TriageFiler) RegisterFlags() { 81 flag.IntVar(&f.topClustersCount, "triage-count", 3, "The number of clusters to sync issues for on github.") 82 flag.IntVar(&f.windowDays, "triage-window", 1, "The size of the sliding time window (in days) that is used to determine which failures to consider.") 83 } 84 85 // triageData is a struct that represents the format of the JSON triage data and is used for parsing. 86 type triageData struct { 87 Builds struct { 88 Cols struct { 89 Elapsed []int `json:"elapsed"` 90 Executor []string `json:"executor"` 91 PR []string `json:"pr"` 92 Result []string `json:"result"` 93 Started []int64 `json:"started"` 94 TestsFailed []int `json:"tests_failed"` 95 TestsRun []int `json:"tests_run"` 96 } `json:"cols"` 97 JobsRaw map[string]interface{} `json:"jobs"` // []int or map[string]int 98 Jobs map[string]BuildIndexer 99 JobPaths map[string]string `json:"job_paths"` 100 } `json:"builds"` 101 Clustered []*Cluster `json:"clustered"` 102 } 103 104 // Cluster holds information about a failure cluster. 
105 type Cluster struct { 106 Identifier string `json:"id"` 107 Key string `json:"key"` 108 Text string `json:"text"` 109 Tests []*Test `json:"tests"` 110 111 filer *TriageFiler 112 jobs map[string][]int 113 totalBuilds int 114 totalJobs int 115 totalTests int 116 } 117 118 // Test holds a name and list of jobs 119 type Test struct { 120 Name string `json:"name"` 121 Jobs []*Job `json:"jobs"` 122 } 123 124 // Job holds a name and list of build numbers 125 type Job struct { 126 Name string `json:"name"` 127 Builds []int `json:"builds"` 128 } 129 130 // filterAndValidate removes failure data that falls outside the time window and ensures that cluster 131 // data is well formed. It also removes data for PR jobs so that only post-submit failures are considered. 132 func (f *TriageFiler) filterAndValidate(windowDays int) error { 133 f.latestStart = int64(0) 134 for _, start := range f.data.Builds.Cols.Started { 135 if start > f.latestStart { 136 f.latestStart = start 137 } 138 } 139 cutoffTime := time.Unix(f.latestStart, 0).AddDate(0, 0, -windowDays).Unix() 140 141 validClusts := []*Cluster{} 142 for clustIndex, clust := range f.data.Clustered { 143 if len(clust.Identifier) == 0 { 144 return fmt.Errorf("the cluster at index %d in the triage JSON data does not specify an ID", clustIndex) 145 } 146 if clust.Tests == nil { 147 return fmt.Errorf("cluster '%s' does not have a 'tests' key", clust.Identifier) 148 } 149 validTests := []*Test{} 150 for _, test := range clust.Tests { 151 if len(test.Name) == 0 { 152 return fmt.Errorf("cluster '%s' contains a test without a name", clust.Identifier) 153 } 154 if test.Jobs == nil { 155 return fmt.Errorf("cluster '%s' does not have a 'jobs' key", clust.Identifier) 156 } 157 validJobs := []*Job{} 158 for _, job := range test.Jobs { 159 if len(job.Name) == 0 { 160 return fmt.Errorf("cluster '%s' contains a job without a name under test '%s'", clust.Identifier, test.Name) 161 } 162 // Filter out PR jobs 163 if 
strings.HasPrefix(job.Name, "pr:") { 164 continue 165 } 166 if len(job.Builds) == 0 { 167 return fmt.Errorf("cluster '%s' contains job '%s' under test '%s' with no failing builds", clust.Identifier, job.Name, test.Name) 168 } 169 validBuilds := []int{} 170 rowMap, ok := f.data.Builds.Jobs[job.Name] 171 if !ok { 172 return fmt.Errorf("triage json data does not contain buildnum to row index mapping for job '%s'", job.Name) 173 } 174 for _, buildnum := range job.Builds { 175 row, err := rowMap.rowForBuild(buildnum) 176 if err != nil { 177 return err 178 } 179 if f.data.Builds.Cols.Started[row] > cutoffTime { 180 validBuilds = append(validBuilds, buildnum) 181 } 182 } 183 if len(validBuilds) > 0 { 184 job.Builds = validBuilds 185 validJobs = append(validJobs, job) 186 } 187 } 188 if len(validJobs) > 0 { 189 test.Jobs = validJobs 190 validTests = append(validTests, test) 191 } 192 } 193 if len(validTests) > 0 { 194 clust.Tests = validTests 195 validClusts = append(validClusts, clust) 196 } 197 } 198 f.data.Clustered = validClusts 199 return nil 200 } 201 202 // BuildIndexer is an interface that describes the buildnum to row index mapping used to retrieve data 203 // about individual builds from the JSON file. 204 // This is an interface because the JSON format describing failure clusters has 2 ways of recording the mapping info. 205 type BuildIndexer interface { 206 rowForBuild(buildnum int) (int, error) 207 } 208 209 // ContigIndexer is a BuildIndexer implementation for when the buildnum to row index mapping describes 210 // a contiguous set of rows via 3 ints. 211 type ContigIndexer struct { 212 startRow, startBuild, count int 213 } 214 215 func (rowMap ContigIndexer) rowForBuild(buildnum int) (int, error) { 216 if buildnum < rowMap.startBuild || buildnum > rowMap.startBuild+rowMap.count-1 { 217 return 0, fmt.Errorf("failed to find row in JSON for buildnumber: %d. 
Row mapping or buildnumber is invalid", buildnum) 218 } 219 return buildnum - rowMap.startBuild + rowMap.startRow, nil 220 } 221 222 // DictIndexer is a BuildIndexer implementation for when the buildnum to row index mapping is simply a dictionary. 223 // The value type of this dictionary is interface instead of int so that we don't have to convert the original map. 224 type DictIndexer map[string]interface{} 225 226 func (rowMap DictIndexer) rowForBuild(buildnum int) (int, error) { 227 row, ok := rowMap[strconv.Itoa(buildnum)] 228 if !ok { 229 return 0, fmt.Errorf("failed to find row in JSON for buildnumber: %d. Row mapping or buildnumber is invalid", buildnum) 230 } 231 var irow float64 232 if irow, ok = row.(float64); !ok { 233 return 0, fmt.Errorf("failed to find row in JSON for buildnumber: %d. Row mapping contains invalid type", buildnum) 234 } 235 return int(irow), nil 236 } 237 238 // loadClusters parses and filters the json data, then populates every Cluster struct with 239 // aggregated job data and totals. The job data specifies all jobs that failed in a cluster and the 240 // builds that failed for each job, independent of which tests the jobs or builds failed. 241 func (f *TriageFiler) loadClusters(jsonIn []byte) ([]*Cluster, error) { 242 var err error 243 f.data, err = parseTriageData(jsonIn) 244 if err != nil { 245 return nil, err 246 } 247 if err = f.filterAndValidate(f.windowDays); err != nil { 248 return nil, err 249 } 250 251 // Aggregate failing builds in each cluster by job (independent of tests). 
252 for _, clust := range f.data.Clustered { 253 clust.filer = f 254 clust.jobs = make(map[string][]int) 255 256 for _, test := range clust.Tests { 257 for _, job := range test.Jobs { 258 for _, buildnum := range job.Builds { 259 found := false 260 for _, oldBuild := range clust.jobs[job.Name] { 261 if oldBuild == buildnum { 262 found = true 263 break 264 } 265 } 266 if !found { 267 clust.jobs[job.Name] = append(clust.jobs[job.Name], buildnum) 268 } 269 } 270 } 271 } 272 clust.totalJobs = len(clust.jobs) 273 clust.totalTests = len(clust.Tests) 274 clust.totalBuilds = 0 275 for _, builds := range clust.jobs { 276 clust.totalBuilds += len(builds) 277 } 278 } 279 return f.data.Clustered, nil 280 } 281 282 // parseTriageData unmarshals raw json data into a triageData struct and creates a BuildIndexer for 283 // every job. 284 func parseTriageData(jsonIn []byte) (*triageData, error) { 285 var data triageData 286 if err := json.Unmarshal(jsonIn, &data); err != nil { 287 return nil, err 288 } 289 290 if data.Builds.Cols.Started == nil { 291 return nil, fmt.Errorf("triage data json is missing the builds.cols.started key") 292 } 293 if data.Builds.JobsRaw == nil { 294 return nil, fmt.Errorf("triage data is missing the builds.jobs key") 295 } 296 if data.Builds.JobPaths == nil { 297 return nil, fmt.Errorf("triage data is missing the builds.job_paths key") 298 } 299 if data.Clustered == nil { 300 return nil, fmt.Errorf("triage data is missing the clustered key") 301 } 302 // Populate 'Jobs' with the BuildIndexer for each job. 303 data.Builds.Jobs = make(map[string]BuildIndexer) 304 for jobID, mapper := range data.Builds.JobsRaw { 305 switch mapper := mapper.(type) { 306 case []interface{}: 307 // In this case mapper is a 3 member array. 0:first buildnum, 1:number of builds, 2:start index. 
308 data.Builds.Jobs[jobID] = ContigIndexer{ 309 startBuild: int(mapper[0].(float64)), 310 count: int(mapper[1].(float64)), 311 startRow: int(mapper[2].(float64)), 312 } 313 case map[string]interface{}: 314 // In this case mapper is a dictionary. 315 data.Builds.Jobs[jobID] = DictIndexer(mapper) 316 default: 317 return nil, fmt.Errorf("the build number to row index mapping for job '%s' is not an accepted type. Type is: %v", jobID, reflect.TypeOf(mapper)) 318 } 319 } 320 return &data, nil 321 } 322 323 // topClusters gets the 'count' most important clusters from a slice of clusters based on number of build failures. 324 func topClusters(clusters []*Cluster, count int) []*Cluster { 325 less := func(i, j int) bool { return clusters[i].totalBuilds > clusters[j].totalBuilds } 326 sort.SliceStable(clusters, less) 327 328 if len(clusters) < count { 329 count = len(clusters) 330 } 331 return clusters[0:count] 332 } 333 334 // topTestsFailing returns the top 'count' test names sorted by number of failing jobs. 335 func (c *Cluster) topTestsFailed(count int) []*Test { 336 less := func(i, j int) bool { return len(c.Tests[i].Jobs) > len(c.Tests[j].Jobs) } 337 sort.SliceStable(c.Tests, less) 338 339 if len(c.Tests) < count { 340 count = len(c.Tests) 341 } 342 return c.Tests[0:count] 343 } 344 345 // topJobsFailed returns the top 'count' job names sorted by number of failing builds. 346 func (c *Cluster) topJobsFailed(count int) []*Job { 347 slice := make([]*Job, len(c.jobs)) 348 i := 0 349 for jobName, builds := range c.jobs { 350 slice[i] = &Job{Name: jobName, Builds: builds} 351 i++ 352 } 353 less := func(i, j int) bool { return len(slice[i].Builds) > len(slice[j].Builds) } 354 sort.SliceStable(slice, less) 355 356 if len(slice) < count { 357 count = len(slice) 358 } 359 return slice[0:count] 360 } 361 362 // Title is the string to use as the github issue title. 363 func (c *Cluster) Title() string { 364 return fmt.Sprintf("Failure cluster [%s...] 
failed %d builds, %d jobs, and %d tests over %d days", 365 c.Identifier[0:6], 366 c.totalBuilds, 367 c.totalJobs, 368 c.totalTests, 369 c.filer.windowDays, 370 ) 371 } 372 373 // Body returns the body text of the github issue and *must* contain the output of ID(). 374 // closedIssues is a (potentially empty) slice containing all closed issues authored by this bot 375 // that contain ID() in their body. 376 // If Body returns an empty string no issue is created. 377 func (c *Cluster) Body(closedIssues []*githubapi.Issue) string { 378 // First check that the most recently closed issue (if any exist) was closed 379 // before the start of the sliding window. 380 cutoffTime := time.Unix(c.filer.latestStart, 0).AddDate(0, 0, -c.filer.windowDays) 381 for _, closed := range closedIssues { 382 if closed.ClosedAt.After(cutoffTime) { 383 return "" 384 } 385 } 386 387 var buf bytes.Buffer 388 fmt.Fprintf(&buf, "### Failure cluster [%s](%s#%s)\n", c.ID(), triageURL, c.Identifier) 389 fmt.Fprintf(&buf, "##### Error text:\n```\n%s\n```\n", c.Text) 390 // cluster stats 391 fmt.Fprint(&buf, "##### Failure cluster statistics:\n") 392 fmt.Fprintf(&buf, "%d tests failed, %d jobs failed, %d builds failed.\n", c.totalTests, c.totalJobs, c.totalBuilds) 393 fmt.Fprintf(&buf, "Failure stats cover %d day time range '%s' to '%s'.\n##### Top failed tests by jobs failed:\n", 394 c.filer.windowDays, 395 cutoffTime.Format(timeFormat), 396 time.Unix(c.filer.latestStart, 0).Format(timeFormat)) 397 // top tests failed 398 fmt.Fprint(&buf, "\n| Test Name | Jobs Failed |\n| --- | --- |\n") 399 for _, test := range c.topTestsFailed(topTestsCount) { 400 fmt.Fprintf(&buf, "| %s | %d |\n", test.Name, len(test.Jobs)) 401 } 402 // top jobs failed 403 fmt.Fprint(&buf, "\n##### Top failed jobs by builds failed:\n") 404 fmt.Fprint(&buf, "\n| Job Name | Builds Failed | Latest Failure |\n| --- | --- | --- |\n") 405 for _, job := range c.topJobsFailed(topJobsCount) { 406 latest := 0 407 latestTime := int64(0) 
408 rowMap := c.filer.data.Builds.Jobs[job.Name] 409 for _, build := range job.Builds { 410 row, _ := rowMap.rowForBuild(build) // Already validated start time lookup for all builds. 411 buildTime := c.filer.data.Builds.Cols.Started[row] 412 if buildTime > latestTime { 413 latestTime = buildTime 414 latest = build 415 } 416 } 417 path := strings.TrimPrefix(c.filer.data.Builds.JobPaths[job.Name], "gs://") 418 fmt.Fprintf(&buf, "| %s | %d | [%s](https://prow.k8s.io/view/gs/%s/%d) |\n", job.Name, len(job.Builds), time.Unix(latestTime, 0).Format(timeFormat), path, latest) 419 } 420 // previously closed issues if there are any 421 if len(closedIssues) > 0 { 422 fmt.Fprint(&buf, "\n##### Previously closed issues for this cluster:\n") 423 for _, closed := range closedIssues { 424 fmt.Fprintf(&buf, "#%d ", *closed.Number) 425 } 426 fmt.Fprint(&buf, "\n") 427 } 428 429 // Create /assign command. 430 testNames := make([]string, 0, len(c.Tests)) 431 for _, test := range c.topTestsFailed(len(c.Tests)) { 432 testNames = append(testNames, test.Name) 433 } 434 ownersMap := c.filer.creator.TestsOwners(testNames) 435 if len(ownersMap) > 0 { 436 fmt.Fprint(&buf, "\n/assign") 437 for user := range ownersMap { 438 fmt.Fprintf(&buf, " @%s", user) 439 } 440 fmt.Fprint(&buf, "\n") 441 } 442 443 // Explanations of assignees and sigs 444 fmt.Fprint(&buf, c.filer.creator.ExplainTestAssignments(testNames)) 445 446 fmt.Fprintf(&buf, "\n[Current Status](%s#%s)", triageURL, c.Identifier) 447 448 return buf.String() 449 } 450 451 // ID yields the string identifier that uniquely identifies this issue. 452 // This ID must appear in the body of the issue. 453 // DO NOT CHANGE how this ID is formatted or duplicate issues may be created on github. 454 func (c *Cluster) ID() string { 455 return c.Identifier 456 } 457 458 // Labels returns the labels to apply to the issue created for this cluster on github. 
459 func (c *Cluster) Labels() []string { 460 labels := []string{"kind/flake"} 461 462 topTests := make([]string, len(c.Tests)) 463 for i, test := range c.topTestsFailed(len(c.Tests)) { 464 topTests[i] = test.Name 465 } 466 for sig := range c.filer.creator.TestsSIGs(topTests) { 467 labels = append(labels, "sig/"+sig) 468 } 469 470 return labels 471 } 472 473 // Owners returns the list of usernames to assign to this issue on github. 474 func (c *Cluster) Owners() []string { 475 // Assign owners by including a /assign command in the body instead of using Owners to set 476 // assignees on the issue request. This lets prow do the assignee validation and will mention 477 // the user we want to assign even if they can't be assigned. 478 return nil 479 } 480 481 // Priority calculates and returns the priority of this issue. 482 // The returned bool indicates if the returned priority is valid and can be used. 483 func (c *Cluster) Priority() (string, bool) { 484 // TODO implement priority calcs later. 485 return "", false 486 }