k8s.io/test-infra@v0.0.0-20240520184403-27c6b4c223d8/robots/issue-creator/sources/flakyjob-reporter.go (about)

     1  /*
     2  Copyright 2017 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package sources
    18  
    19  import (
    20  	"bytes"
    21  	"encoding/json"
    22  	"flag"
    23  	"fmt"
    24  	"io"
    25  	"net/http"
    26  	"sort"
    27  	"time"
    28  
    29  	"github.com/golang/glog"
    30  
    31  	githubapi "github.com/google/go-github/github"
    32  	"k8s.io/test-infra/robots/issue-creator/creator"
    33  )
    34  
// FlakyJob is a struct that represents a single job and the flake data associated with it.
// FlakyJob implements the Issue interface so that it can be synced with github issues via the IssueCreator.
//
// The JSON-tagged fields are pointers or maps so that parseFlakyJobs can tell a missing
// key (nil) apart from a zero value and skip incomplete entries.
type FlakyJob struct {
	// Name is the job's name. parseFlakyJobs sets it to the key under which the job
	// appears in the flaky job JSON object.
	Name string
	// Consistency is the share of builds that passed. Body multiplies it by 100 before
	// printing it with a percent sign, so it is presumably a fraction in [0, 1].
	Consistency *float64 `json:"consistency"`
	// FlakeCount is the number of flakes.
	FlakeCount *int `json:"flakes"`
	// FlakyTests is a map of test names to the number of times that test failed.
	// Any test that failed at least once a day for the past week on this job is included.
	FlakyTests map[string]int `json:"flakiest"`
	// testsSorted is a list of the FlakyTests test names sorted by desc. number of flakes.
	// This field is lazily populated and should be accessed via TestsSorted().
	testsSorted []string

	// reporter is a pointer to the FlakyJobReporter that created this FlakyJob.
	// Body and Labels reach the IssueCreator and the data URL through it.
	reporter *FlakyJobReporter
}
    54  
// FlakyJobReporter is a munger that creates github issues for the flakiest kubernetes jobs.
// The flakiest jobs are parsed from JSON generated by /test-infra/experiment/bigquery/flakes.sh
type FlakyJobReporter struct {
	// flakyJobDataURL is the URL the flaky job JSON is fetched from (flag "flakyjob-url").
	flakyJobDataURL string
	// syncCount is the number of flaky jobs Issues tries to return (flag "flakyjob-count").
	syncCount       int

	// creator is the IssueCreator handed to the most recent Issues call; the FlakyJobs
	// produced by this reporter use it to look up test owners and SIGs.
	creator *creator.IssueCreator
}
    63  
    64  func init() {
    65  	creator.RegisterSourceOrDie("flakyjob-reporter", &FlakyJobReporter{})
    66  }
    67  
    68  // RegisterFlags registers options for this munger; returns any that require a restart when changed.
    69  func (fjr *FlakyJobReporter) RegisterFlags() {
    70  	flag.StringVar(&fjr.flakyJobDataURL, "flakyjob-url", "https://storage.googleapis.com/k8s-metrics/flakes-latest.json", "The url where flaky job JSON data can be found.")
    71  	flag.IntVar(&fjr.syncCount, "flakyjob-count", 3, "The number of flaky jobs to try to sync to github.")
    72  }
    73  
    74  // Issues is the main work method of FlakyJobReporter. It fetches and parses flaky job data,
    75  // then syncs the top issues to github with the IssueCreator.
    76  func (fjr *FlakyJobReporter) Issues(c *creator.IssueCreator) ([]creator.Issue, error) {
    77  	fjr.creator = c
    78  	json, err := ReadHTTP(fjr.flakyJobDataURL)
    79  	if err != nil {
    80  		return nil, err
    81  	}
    82  
    83  	flakyJobs, err := fjr.parseFlakyJobs(json)
    84  	if err != nil {
    85  		return nil, err
    86  	}
    87  
    88  	count := fjr.syncCount
    89  	if len(flakyJobs) < count {
    90  		count = len(flakyJobs)
    91  	}
    92  	issues := make([]creator.Issue, 0, count)
    93  	for _, fj := range flakyJobs[0:count] {
    94  		issues = append(issues, fj)
    95  	}
    96  
    97  	return issues, nil
    98  }
    99  
   100  // parseFlakyJobs parses JSON generated by the 'flakes' bigquery metric into a sorted slice of
   101  // *FlakyJob.
   102  func (fjr *FlakyJobReporter) parseFlakyJobs(jsonIn []byte) ([]*FlakyJob, error) {
   103  	var flakeMap map[string]*FlakyJob
   104  	err := json.Unmarshal(jsonIn, &flakeMap)
   105  	if err != nil || flakeMap == nil {
   106  		return nil, fmt.Errorf("error unmarshaling flaky jobs json: %w", err)
   107  	}
   108  	flakyJobs := make([]*FlakyJob, 0, len(flakeMap))
   109  
   110  	for job, fj := range flakeMap {
   111  		if job == "" {
   112  			glog.Errorf("Flaky jobs json contained a job with an empty jobname.\n")
   113  			continue
   114  		}
   115  		if fj == nil {
   116  			glog.Errorf("Flaky jobs json has invalid data for job '%s'.\n", job)
   117  			continue
   118  		}
   119  		if fj.Consistency == nil {
   120  			glog.Errorf("Flaky jobs json has no 'consistency' field for job '%s'.\n", job)
   121  			continue
   122  		}
   123  		if fj.FlakeCount == nil {
   124  			glog.Errorf("Flaky jobs json has no 'flakes' field for job '%s'.\n", job)
   125  			continue
   126  		}
   127  		if fj.FlakyTests == nil {
   128  			glog.Errorf("Flaky jobs json has no 'flakiest' field for job '%s'.\n", job)
   129  			continue
   130  		}
   131  		fj.Name = job
   132  		fj.reporter = fjr
   133  		flakyJobs = append(flakyJobs, fj)
   134  	}
   135  
   136  	sort.SliceStable(flakyJobs, func(i, j int) bool {
   137  		if *flakyJobs[i].FlakeCount == *flakyJobs[j].FlakeCount {
   138  			return *flakyJobs[i].Consistency < *flakyJobs[j].Consistency
   139  		}
   140  		return *flakyJobs[i].FlakeCount > *flakyJobs[j].FlakeCount
   141  	})
   142  
   143  	return flakyJobs, nil
   144  }
   145  
   146  // TestsSorted returns a slice of the testnames from a FlakyJob's FlakyTests map. The slice is
   147  // sorted by descending number of failures for the tests.
   148  func (fj *FlakyJob) TestsSorted() []string {
   149  	if fj.testsSorted != nil {
   150  		return fj.testsSorted
   151  	}
   152  	fj.testsSorted = make([]string, len(fj.FlakyTests))
   153  	i := 0
   154  	for test := range fj.FlakyTests {
   155  		fj.testsSorted[i] = test
   156  		i++
   157  	}
   158  	sort.SliceStable(fj.testsSorted, func(i, j int) bool {
   159  		return fj.FlakyTests[fj.testsSorted[i]] > fj.FlakyTests[fj.testsSorted[j]]
   160  	})
   161  	return fj.testsSorted
   162  }
   163  
   164  // Title yields the initial title text of the github issue.
   165  func (fj *FlakyJob) Title() string {
   166  	return fmt.Sprintf("%s flaked %d times in the past week", fj.Name, *fj.FlakeCount)
   167  }
   168  
   169  // ID yields the string identifier that uniquely identifies this issue.
   170  // This ID must appear in the body of the issue.
   171  // DO NOT CHANGE how this ID is formatted or duplicate issues may be created on github.
   172  func (fj *FlakyJob) ID() string {
   173  	return fmt.Sprintf("Flaky Job: %s", fj.Name)
   174  }
   175  
   176  // Body returns the body text of the github issue and *must* contain the output of ID().
   177  // closedIssues is a (potentially empty) slice containing all closed issues authored by this bot
   178  // that contain ID() in their body.
   179  // If Body returns an empty string no issue is created.
   180  func (fj *FlakyJob) Body(closedIssues []*githubapi.Issue) string {
   181  	// First check that the most recently closed issue (if any exist) was closed
   182  	// at least a week ago (since that is the sliding window size used by the flake metric).
   183  	cutoffTime := time.Now().AddDate(0, 0, -7)
   184  	for _, closed := range closedIssues {
   185  		if closed.ClosedAt.After(cutoffTime) {
   186  			return ""
   187  		}
   188  	}
   189  
   190  	// Print stats about the flaky job.
   191  	var buf bytes.Buffer
   192  	fmt.Fprintf(&buf, "### %s\n Flakes in the past week: **%d**\n Consistency: **%.2f%%**\n",
   193  		fj.ID(), *fj.FlakeCount, *fj.Consistency*100)
   194  	if len(fj.FlakyTests) > 0 {
   195  		fmt.Fprint(&buf, "\n#### Flakiest tests by flake count:\n| Test | Flake Count |\n| --- | --- |\n")
   196  		for _, testName := range fj.TestsSorted() {
   197  			fmt.Fprintf(&buf, "| %s | %d |\n", testName, fj.FlakyTests[testName])
   198  		}
   199  	}
   200  	// List previously closed issues if there are any.
   201  	if len(closedIssues) > 0 {
   202  		fmt.Fprint(&buf, "\n#### Previously closed issues for this job flaking:\n")
   203  		for _, closed := range closedIssues {
   204  			fmt.Fprintf(&buf, "#%d ", *closed.Number)
   205  		}
   206  		fmt.Fprint(&buf, "\n")
   207  	}
   208  
   209  	// Create /assign command.
   210  	testsSorted := fj.TestsSorted()
   211  	ownersMap := fj.reporter.creator.TestsOwners(testsSorted)
   212  	if len(ownersMap) > 0 {
   213  		fmt.Fprint(&buf, "\n/assign")
   214  		for user := range ownersMap {
   215  			fmt.Fprintf(&buf, " @%s", user)
   216  		}
   217  		fmt.Fprint(&buf, "\n")
   218  	}
   219  
   220  	// Explain why assignees were assigned and why sig labels were applied.
   221  	fmt.Fprintf(&buf, "\n%s", fj.reporter.creator.ExplainTestAssignments(testsSorted))
   222  
   223  	fmt.Fprintf(&buf, "\n[Flakiest Jobs](%s)\n", fj.reporter.flakyJobDataURL)
   224  
   225  	fmt.Fprintf(&buf, "\n/kind flake\n")
   226  
   227  	return buf.String()
   228  }
   229  
   230  // Labels returns the labels to apply to the issue created for this flaky job on github.
   231  func (fj *FlakyJob) Labels() []string {
   232  	labels := []string{"kind/flake"}
   233  	// get sig labels
   234  	for sig := range fj.reporter.creator.TestsSIGs(fj.TestsSorted()) {
   235  		labels = append(labels, "sig/"+sig)
   236  	}
   237  	return labels
   238  }
   239  
   240  // Owners returns the list of usernames to assign to this issue on github.
   241  func (fj *FlakyJob) Owners() []string {
   242  	// Assign owners by including a /assign command in the body instead of using Owners to set
   243  	// assignees on the issue request. This lets prow do the assignee validation and will mention
   244  	// the user we want to assign even if they can't be assigned.
   245  	return nil
   246  }
   247  
   248  // Priority calculates and returns the priority of this issue
   249  // The returned bool indicates if the returned priority is valid and can be used
   250  func (fj *FlakyJob) Priority() (string, bool) {
   251  	// TODO: implement priority calculations later
   252  	return "", false
   253  }
   254  
   255  // ReadHTTP fetches file contents from a URL with retries.
   256  func ReadHTTP(url string) ([]byte, error) {
   257  	var err error
   258  	retryDelay := time.Duration(2) * time.Second
   259  	for retryCount := 0; retryCount < 5; retryCount++ {
   260  		if retryCount > 0 {
   261  			time.Sleep(retryDelay)
   262  			retryDelay *= time.Duration(2)
   263  		}
   264  
   265  		resp, err := http.Get(url)
   266  		if resp != nil && resp.StatusCode >= 500 {
   267  			// Retry on this type of error.
   268  			continue
   269  		}
   270  		if err != nil {
   271  			return nil, err
   272  		}
   273  		defer resp.Body.Close()
   274  
   275  		body, err := io.ReadAll(resp.Body)
   276  		if err != nil {
   277  			continue
   278  		}
   279  		return body, nil
   280  	}
   281  	return nil, fmt.Errorf("ran out of retries reading from '%s'. Last error was %w", url, err)
   282  }