github.com/aclements/go-misc/findflakes/main.go

// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

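// Findflakes analyzes failure logs from the Go build dashboard (as
// fetched by fetchlogs) to identify classes of flaky test failures,
// estimate how likely each class is to still be happening, and
// report the results as text or HTML.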
package main

import (
	"flag"
	"log"
	"os"
	"path/filepath"
	"regexp"
	"runtime"
	"runtime/pprof"
	"sort"
	"strings"
	"sync"

	"github.com/aclements/go-misc/internal/loganal"
)

var (
	flagRevDir = flag.String("dir", defaultRevDir(), "search logs under `directory`")
	flagBranch = flag.String("branch", "master", "analyze commits to `branch`")
	flagHTML   = flag.Bool("html", false, "print an HTML report")
	flagLimit  = flag.Int("limit", 0, "process only most recent `N` revisions")

	// TODO: Is this really just a separate mode? Should we have
	// subcommands?
	flagGrep  = flag.String("grep", "", "show analysis for logs matching `regexp`")
	flagPaths = flag.Bool("paths", false, "read dir-relative paths of logs with failures from stdin (useful with greplogs -l)")
)

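// defaultRevDir returns the default directory containing fetchlogs'
// per-revision log cache.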
func defaultRevDir() string {
	return filepath.Join(xdgCacheDir(), "fetchlogs", "rev")
}

// TODO: Tool you can point at a failure log to annotate each failure
// in the log with links to past instances of that failure. This just
// uses log analysis.

// TODO: If we were careful about merges, we could potentially use
// information from other branches to add additional samples between
// merge points.

// TODO: Consider each build a separate event, rather than each
// revision. It doesn't matter what "order" they're in, though we
// should randomize it for each revision. History subdivision should
// only happen on revision boundaries.
//
// OTOH, this makes deterministic failures on specific
// OSs/architectures look like non-deterministic failures.
//
// This would also mean it's more important to identify builds in
// which a test wasn't even executed (e.g., because an earlier test
// failed) so we don't count those as "successes". OTOH, it may be
// sufficient to consider a test executed unless we see a failure in
// that test or that build didn't happen (e.g., a gap in the history).
//
// This would also help with fixing the problem where hard build
// failures are considered successes of all tests.
//
// This would also be more sound when builders are added at some point
// in the history. If the probability of a failure is really constant
// per build, adding a builder will increase the probability of seeing
// the failure at the commit level. Of course, if it's conditional on
// OS or architecture or builder, this will make it look *less* likely
// at the build level.
//
// Along these lines, the culprit analysis should have the property
// that doing more runs around possible culprit commits improves the
// fidelity of the culprit results.

// TODO: Support pointing this at a set of stress test failures (along
// with the count of total runs, I guess) and having it classify and
// report failures. In this case there's no order or commit sequence
// involved, so there's no time series analysis or
// first/last/culprits, but the classification and failure probability
// are still useful.
//
// It also makes sense to point this at a stress test of a sequence of
// commits, in which case the culprit analysis is still useful. This
// probably integrates well with the previous TODO of considering each
// build a separate event, and it's closely related to the problem of
// new builders being added.

func main() {
	var cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file")

	flag.Parse()
	if flag.NArg() > 0 {
		flag.Usage()
		os.Exit(2)
	}

	if *cpuprofile != "" {
		f, err := os.Create(*cpuprofile)
		if err != nil {
			log.Fatal(err)
		}
		if err := pprof.StartCPUProfile(f); err != nil {
			log.Fatal(err)
		}
		defer pprof.StopCPUProfile()
	}

	allRevs, err := LoadRevisions(*flagRevDir)
	if err != nil {
		log.Fatal(err)
	}

	// Filter to revisions on this branch.
	revs := []*Revision{}
	for _, rev := range allRevs {
		if rev.Branch == *flagBranch {
			revs = append(revs, rev)
		}
	}
	if len(revs) == 0 {
		log.Fatal("no revisions found")
	}

	// Limit to most recent N revisions.
	if *flagLimit > 0 && len(revs) > *flagLimit {
		revs = revs[len(revs)-*flagLimit:]
	}

	if *flagGrep != "" {
		// Grep mode.
		re, err := regexp.Compile(*flagGrep)
		if err != nil {
			log.Fatal(err)
		}
		failures := grepFailures(revs, re)
		if len(failures) == 0 {
			return
		}
		fc := newFailureClass(revs, failures)
		printTextFlakeReport(os.Stdout, fc)
		return
	}

	if *flagPaths {
		// Paths mode.
		paths, err := readPaths(os.Stdin)
		if err != nil {
			log.Fatal(err)
		}
		failures := pathFailures(revs, paths)
		if len(failures) == 0 {
			return
		}
		fc := newFailureClass(revs, failures)
		printTextFlakeReport(os.Stdout, fc)
		return
	}

	// Extract failures from logs.
	failures := extractFailures(revs)

	// Classify failures.
	lfailures := make([]*loganal.Failure, len(failures))
	for i, f := range failures {
		lfailures[i] = f.Failure
	}
	failureClasses := loganal.Classify(lfailures)

	// Gather failures from each class and perform flakiness
	// tests.
	classes := []*failureClass{}
	for class, indexes := range failureClasses {
		classFailures := []*failure{}
		for _, fi := range indexes {
			classFailures = append(classFailures, failures[fi])
		}
		fc := newFailureClass(revs, classFailures)
		fc.Class = class

		// Trim failure classes below thresholds. We leave out
		// classes with extremely low failure probabilities
		// because the chance that these are still happening
		// takes a long time to decay and there's almost
		// nothing we can do for culprit analysis.
		if fc.Current < 0.05 || fc.Latest.FailureProbability < 0.01 {
			continue
		}

		classes = append(classes, fc)
	}

	// Sort failure classes by likelihood that failure is still
	// happening.
	sort.Sort(sort.Reverse(currentSorter(classes)))

	if *flagHTML {
		printHTMLReport(os.Stdout, classes)
	} else {
		printTextReport(os.Stdout, classes)
	}
}

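// processFailureLogs reads the log of every failed build in revs and
// applies process to it, running several workers in parallel. It
// returns the failures reported by process across all logs, ordered
// by revision.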
func processFailureLogs(revs []*Revision, process func(build *Build, data []byte) []*failure) []*failure {
	// Create log processing tasks.
	type Task struct {
		t     int
		build *Build
		res   []*failure
	}
	tasks := []Task{}
	for t, rev := range revs {
		for _, build := range rev.Builds {
			if build.Status != BuildFailed {
				continue
			}
			tasks = append(tasks, Task{t, build, nil})
		}
	}

	// Run failure processing.
	todo := make(chan int)
	go func() {
		for i := range tasks {
			todo <- i
		}
		close(todo)
	}()
	var wg sync.WaitGroup
	for i := 0; i < 4*runtime.GOMAXPROCS(-1); i++ {
		wg.Add(1)
		go func() {
			for i := range todo {
				task := tasks[i]

				data, err := task.build.ReadLog()
				if err != nil {
					log.Fatal(err)
				}

				failures := process(task.build, data)

				// Fill build-related fields.
				for _, failure := range failures {
					failure.T = task.t
					failure.CommitsAgo = len(revs) - task.t - 1
					failure.Rev = revs[task.t]
					failure.Build = task.build
				}
				tasks[i].res = failures
			}
			wg.Done()
		}()
	}
	wg.Wait()

	// Gather results.
	failures := []*failure{}
	for _, task := range tasks {
		failures = append(failures, task.res...)
	}
	return failures
}

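// extractFailures parses each failed build log with loganal and
// returns one failure per extracted test failure, ignoring hard
// build failures.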
func extractFailures(revs []*Revision) []*failure {
	return processFailureLogs(revs, func(build *Build, data []byte) []*failure {
		// TODO: OS/Arch
		lfailures, err := loganal.Extract(string(data), "", "")
		if err != nil {
			log.Printf("%s: %v\n", build.LogPath(), err)
			return nil
		}
		if len(lfailures) == 0 {
			return nil
		}

		failures := make([]*failure, 0, len(lfailures))
		for _, lf := range lfailures {
			// Ignore build failures.
			//
			// TODO: This has the effect of counting these
			// as successes for all tests. In the best
			// case, this cuts down on the number of
			// samples per revision. If we have an
			// across-the-board build failure, it will
			// drive down the failure rates of all
			// failures and may even affect timeline
			// subdivision.
			if strings.Contains(lf.Message, "build failed") {
				continue
			}

			failures = append(failures, &failure{
				Failure: lf,
			})
		}
		return failures
	})
}

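// grepFailures returns a single placeholder failure for each failed
// build whose log matches re.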
func grepFailures(revs []*Revision, re *regexp.Regexp) []*failure {
	return processFailureLogs(revs, func(build *Build, data []byte) []*failure {
		if !re.Match(data) {
			return nil
		}
		return []*failure{new(failure)}
	})
}

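// failure is a single observed failure, positioned in the revision
// timeline.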
type failure struct {
	*loganal.Failure

	// T is the index in the revision timeline at which this
	// failure occurred.
	T int

	// CommitsAgo is the number of revisions in the timeline
	// after this failure's revision.
	CommitsAgo int

	// Rev and Build identify the revision and build in which
	// this failure was observed.
	Rev   *Revision
	Build *Build
}

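// failureClass is a set of related failures together with the
// flakiness analysis of that set.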
type failureClass struct {
	// Class gives the common features of this failure class.
	Class loganal.Failure

	// Revs is the sequence of all revisions indexed by time
	// (both successes and failures).
	Revs []*Revision

	// Failures is a slice of all failures in this class, in
	// order of increasing time T. Note that there may be more
	// than one failure at the same time T.
	Failures []*failure

	// Test is the result of the flake test for this failure
	// class.
	Test *FlakeTestResult

	// Latest is the latest flake region (Test.All[0]).
	Latest *FlakeRegion

	// Current is the probability that this failure is still
	// happening.
	Current float64
}

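// newFailureClass runs the flake test over the failure times in
// failures, which must be in increasing time order, and computes the
// probability that the failure is still happening as of the most
// recent revision.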
func newFailureClass(revs []*Revision, failures []*failure) *failureClass {
	fc := failureClass{
		Revs:     revs,
		Failures: failures,
	}
	// Collect the distinct failure times, eliding adjacent
	// duplicates (multiple failures at the same time T).
	times := []int{}
	for i, f := range failures {
		t := f.T
		if i == 0 || times[len(times)-1] != t {
			times = append(times, t)
		}
	}
	fc.Test = FlakeTest(times)
	fc.Latest = &fc.Test.All[0]
	fc.Current = fc.Latest.StillHappening(len(revs) - 1)
	return &fc
}

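// currentSorter orders failure classes by the probability that the
// failure is still happening, breaking ties by failure probability
// and then by class name. main reverses this order to put the most
// likely classes first.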
type currentSorter []*failureClass

func (s currentSorter) Len() int {
	return len(s)
}

func (s currentSorter) Less(i, j int) bool {
	if s[i].Current != s[j].Current {
		return s[i].Current < s[j].Current
	}
	if s[i].Latest.FailureProbability != s[j].Latest.FailureProbability {
		return s[i].Latest.FailureProbability < s[j].Latest.FailureProbability
	}
	return s[i].Class.String() < s[j].Class.String()
}

func (s currentSorter) Swap(i, j int) {
	s[i], s[j] = s[j], s[i]
}