github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/github-post/main.go

github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/github-post/main.go (about)

     1  // Copyright 2016 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  // Command github-post parses the JSON-formatted output from a Go test session,
    12  // as generated by either 'go test -json' or './pkg.test | go tool test2json -t',
    13  // and posts issues for any failed tests to GitHub. If there are no failed
    14  // tests, it assumes that there was a build error and posts the entire log to
    15  // GitHub.
    16  package main
    17  
    18  import (
    19  	"bufio"
    20  	"bytes"
    21  	"context"
    22  	"encoding/json"
    23  	"fmt"
    24  	"io"
    25  	"io/ioutil"
    26  	"log"
    27  	"os"
    28  	"os/exec"
    29  	"regexp"
    30  	"sort"
    31  	"strconv"
    32  	"strings"
    33  	"time"
    34  
    35  	"github.com/cockroachdb/cockroach/pkg/cmd/internal/issues"
    36  	"github.com/cockroachdb/errors"
    37  )
    38  
    39  const (
    40  	pkgEnv = "PKG"
    41  )
    42  
    43  func main() {
    44  	ctx := context.Background()
    45  
    46  	f := func(ctx context.Context, title, packageName, testName, testMessage, authorEmail string) error {
    47  		log.Printf("filing issue with title: %s", title)
    48  		req := issues.PostRequest{
    49  			// TODO(tbg): actually use this as a template and not a hard-coded
    50  			// string.
    51  			TitleTemplate: title,
    52  			BodyTemplate:  issues.UnitTestFailureBody,
    53  			PackageName:   packageName,
    54  			TestName:      testName,
    55  			Message:       testMessage,
    56  			Artifacts:     "",
    57  			AuthorEmail:   authorEmail,
    58  		}
    59  		return issues.Post(ctx, req)
    60  	}
    61  
    62  	if err := listFailures(ctx, os.Stdin, f); err != nil {
    63  		log.Fatal(err)
    64  	}
    65  }
    66  
    67  // This struct is described in the test2json documentation.
    68  // https://golang.org/cmd/test2json/
    69  type testEvent struct {
    70  	Action  string
    71  	Package string
    72  	Test    string
    73  	Output  string
    74  	Time    time.Time // encodes as an RFC3339-format string
    75  	Elapsed float64   // seconds
    76  }
    77  
    78  type scopedTest struct {
    79  	pkg  string
    80  	name string
    81  }
    82  
    83  func scoped(te testEvent) scopedTest {
    84  	if te.Package == "" {
    85  		return scopedTest{pkg: mustPkgFromEnv(), name: te.Test}
    86  	}
    87  	return scopedTest{pkg: te.Package, name: te.Test}
    88  }
    89  
    90  func mustPkgFromEnv() string {
    91  	packageName := maybePkgFromEnv()
    92  	if packageName == "" {
    93  		panic(errors.Errorf("package name environment variable %s is not set", pkgEnv))
    94  	}
    95  	return packageName
    96  }
    97  
    98  func maybePkgFromEnv() string {
    99  	packageName, ok := os.LookupEnv(pkgEnv)
   100  	if !ok {
   101  		return ""
   102  	}
   103  	return packageName
   104  }
   105  
   106  func trimPkg(pkg string) string {
   107  	return strings.TrimPrefix(pkg, issues.CockroachPkgPrefix)
   108  }
   109  
   110  func listFailures(
   111  	ctx context.Context,
   112  	input io.Reader,
   113  	f func(ctx context.Context, title, packageName, testName, testMessage, authorEmail string) error,
   114  ) error {
   115  	// Tests that took less than this are not even considered for slow test
   116  	// reporting. This is so that we protect against large number of
   117  	// programatically-generated subtests.
   118  	const shortTestFilterSecs float64 = 0.5
   119  	var timeoutMsg = "panic: test timed out after"
   120  
   121  	var packageOutput bytes.Buffer
   122  
   123  	// map  from test name to list of events (each log line is an event, plus
   124  	// start and pass/fail events).
   125  	// Tests/events are "outstanding" until we see a final pass/fail event.
   126  	// Because of the way the go test runner prints output, in case a subtest times
   127  	// out or panics, we don't get a pass/fail event for sibling and ancestor
   128  	// tests. Those tests will remain "outstanding" and will be ignored for the
   129  	// purpose of issue reporting.
   130  	outstandingOutput := make(map[scopedTest][]testEvent)
   131  	failures := make(map[scopedTest][]testEvent)
   132  	var slowPassEvents []testEvent
   133  	var slowFailEvents []testEvent
   134  
   135  	// init is true for the preamble of the input before the first "run" test
   136  	// event.
   137  	init := true
   138  	// trustTimestamps will be set if we don't find a marker suggesting that the
   139  	// input comes from a stress run. In that case, stress prints all its output
   140  	// at once (for a captured failed test run), so the test2json timestamps are
   141  	// meaningless.
   142  	trustTimestamps := true
   143  	// elapsedTotalSec accumulates the time spent in all tests, passing or
   144  	// failing. In case the input comes from a stress run, this will be used to
   145  	// deduce the duration of a timed out test.
   146  	var elapsedTotalSec float64
   147  	// Will be set if the last test timed out.
   148  	var timedOutCulprit scopedTest
   149  	var timedOutEvent testEvent
   150  	var curTestStart time.Time
   151  	var last scopedTest
   152  	var lastEvent testEvent
   153  	scanner := bufio.NewScanner(input)
   154  	for scanner.Scan() {
   155  		var te testEvent
   156  		{
   157  			line := scanner.Text() // has no EOL marker
   158  			if len(line) <= 2 || line[0] != '{' || line[len(line)-1] != '}' {
   159  				// This line is not test2json output, skip it. This can happen if
   160  				// whatever feeds our input has some extra non-JSON lines such as
   161  				// would happen with `make` invocations.
   162  				continue
   163  			}
   164  			if err := json.Unmarshal([]byte(line), &te); err != nil {
   165  				return errors.Wrapf(err, "unable to parse %q", line)
   166  			}
   167  		}
   168  		lastEvent = te
   169  
   170  		if te.Test != "" {
   171  			init = false
   172  		}
   173  		if init && strings.Contains(te.Output, "-exec 'stress '") {
   174  			trustTimestamps = false
   175  		}
   176  		if timedOutCulprit.name == "" && te.Elapsed > 0 {
   177  			// We don't count subtests as those are counted in the parent.
   178  			if split := strings.SplitN(te.Test, "/", 2); len(split) == 1 {
   179  				elapsedTotalSec += te.Elapsed
   180  			}
   181  		}
   182  
   183  		if timedOutCulprit.name == te.Test && te.Elapsed != 0 {
   184  			te.Elapsed = timedOutEvent.Elapsed
   185  		}
   186  
   187  		// Events for the overall package test do not set Test.
   188  		if len(te.Test) > 0 {
   189  			switch te.Action {
   190  			case "run":
   191  				last = scoped(te)
   192  				if trustTimestamps {
   193  					curTestStart = te.Time
   194  				}
   195  			case "output":
   196  				key := scoped(te)
   197  				outstandingOutput[key] = append(outstandingOutput[key], te)
   198  				if strings.Contains(te.Output, timeoutMsg) {
   199  					timedOutCulprit = key
   200  
   201  					// Fill in the Elapsed field for a timeout event.
   202  					// As of go1.11, the Elapsed field is bogus for fail events for timed
   203  					// out tests, so we do our own computation.
   204  					// See https://github.com/golang/go/issues/27568
   205  					//
   206  					// Also, if the input is coming from stress, there will not even be a
   207  					// fail event for the test, so the Elapsed field computed here will be
   208  					// useful.
   209  					if trustTimestamps {
   210  						te.Elapsed = te.Time.Sub(curTestStart).Seconds()
   211  					} else {
   212  						// If we don't trust the timestamps, then we compute the test's
   213  						// duration by subtracting all the durations that we've seen so far
   214  						// (which we do trust to some extent). Note that this is not
   215  						// entirely accurate, since there's no information about the
   216  						// duration about sibling subtests which may have run. And further
   217  						// note that it doesn't work well at all for small timeouts because
   218  						// the resolution that the test durations have is just tens of
   219  						// milliseconds, so many quick tests are rounded of to a duration of
   220  						// 0.
   221  						re := regexp.MustCompile(`panic: test timed out after (\d*(?:\.\d*)?)(.)`)
   222  						matches := re.FindStringSubmatch(te.Output)
   223  						if matches == nil {
   224  							log.Printf("failed to parse timeout message: %s", te.Output)
   225  							te.Elapsed = -1
   226  						} else {
   227  							dur, err := strconv.ParseFloat(matches[1], 64)
   228  							if err != nil {
   229  								log.Fatal(err)
   230  							}
   231  							if matches[2] == "m" {
   232  								// minutes to seconds
   233  								dur *= 60
   234  							} else if matches[2] != "s" {
   235  								log.Fatalf("unexpected time unit in: %s", te.Output)
   236  							}
   237  							te.Elapsed = dur - elapsedTotalSec
   238  						}
   239  					}
   240  					timedOutEvent = te
   241  				}
   242  			case "pass", "skip":
   243  				if timedOutCulprit.name != "" {
   244  					panic(fmt.Sprintf("detected test timeout but test seems to have passed (%+v)", te))
   245  				}
   246  				delete(outstandingOutput, scoped(te))
   247  				if te.Elapsed > shortTestFilterSecs {
   248  					// We ignore subtests; their time contributes to the parent's.
   249  					if !strings.Contains(te.Test, "/") {
   250  						slowPassEvents = append(slowPassEvents, te)
   251  					}
   252  				}
   253  			case "fail":
   254  				key := scoped(te)
   255  				// Record slow tests. We ignore subtests; their time contributes to the
   256  				// parent's. Except the timed out (sub)test, for which the parent (if
   257  				// any) is not going to appear in the report because there's not going
   258  				// to be a pass/fail event for it.
   259  				if !strings.Contains(te.Test, "/") || timedOutCulprit == key {
   260  					slowFailEvents = append(slowFailEvents, te)
   261  				}
   262  				// Move the test to the failures collection unless the test timed out.
   263  				// We have special reporting for timeouts below.
   264  				if timedOutCulprit != key {
   265  					failures[key] = outstandingOutput[key]
   266  				}
   267  				delete(outstandingOutput, key)
   268  			}
   269  		} else if te.Action == "output" {
   270  			// Output was outside the context of a test. This consists mostly of the
   271  			// preamble and epilogue that Make outputs, but also any log messages that
   272  			// are printed by a test binary's main function.
   273  			packageOutput.WriteString(te.Output)
   274  		}
   275  	}
   276  
   277  	// On timeout, we might or might not have gotten a fail event for the timed
   278  	// out test (we seem to get one when processing output from a test binary run,
   279  	// but not when processing the output of `stress`, which adds some lines at
   280  	// the end). If we haven't gotten a fail event, the test's output is still
   281  	// outstanding and the test is not registered in the slowFailEvents
   282  	// collection. The timeout handling code below relies on slowFailEvents not
   283  	// being empty though, so we'll process the test here.
   284  	if timedOutCulprit.name != "" {
   285  		if _, ok := outstandingOutput[timedOutCulprit]; ok {
   286  			slowFailEvents = append(slowFailEvents, timedOutEvent)
   287  			delete(outstandingOutput, timedOutCulprit)
   288  		}
   289  	} else {
   290  		// If we haven't received a final event for the last test, then a
   291  		// panic/log.Fatal must have happened. Consider it failed.
   292  		// Note that because of https://github.com/golang/go/issues/27582 there
   293  		// might be other outstanding tests; we ignore those.
   294  		if _, ok := outstandingOutput[last]; ok {
   295  			log.Printf("found outstanding output. Considering last test failed: %s", last)
   296  			failures[last] = outstandingOutput[last]
   297  		}
   298  	}
   299  
   300  	// test2json always puts a fail event last unless it sees a big pass message
   301  	// from the test output.
   302  	if lastEvent.Action == "fail" && len(failures) == 0 && timedOutCulprit.name == "" {
   303  		// If we couldn't find a failing Go test, assume that a failure occurred
   304  		// before running Go and post an issue about that.
   305  		const unknown = "(unknown)"
   306  		packageName := maybePkgFromEnv()
   307  		if packageName == "" {
   308  			packageName = "unknown"
   309  		}
   310  		trimmedPkgName := trimPkg(packageName)
   311  		title := fmt.Sprintf("%s: package failed", trimmedPkgName)
   312  		if err := f(
   313  			ctx, title, packageName, unknown, packageOutput.String(), "", /* authorEmail */
   314  		); err != nil {
   315  			return errors.Wrap(err, "failed to post issue")
   316  		}
   317  	} else {
   318  		for test, testEvents := range failures {
   319  			if split := strings.SplitN(test.name, "/", 2); len(split) == 2 {
   320  				parentTest, subTest := scopedTest{pkg: test.pkg, name: split[0]}, scopedTest{pkg: test.pkg, name: split[1]}
   321  				log.Printf("consolidating failed subtest %q into parent test %q", subTest.name, parentTest.name)
   322  				failures[parentTest] = append(failures[parentTest], testEvents...)
   323  				delete(failures, test)
   324  			} else {
   325  				log.Printf("failed parent test %q", test)
   326  			}
   327  		}
   328  		// Sort the failed tests to make the unit tests for this script deterministic.
   329  		var failedTestNames []scopedTest
   330  		for name := range failures {
   331  			failedTestNames = append(failedTestNames, name)
   332  		}
   333  		sort.Slice(failedTestNames, func(i, j int) bool {
   334  			return fmt.Sprint(failedTestNames[i]) < fmt.Sprint(failedTestNames[j])
   335  		})
   336  		for _, test := range failedTestNames {
   337  			testEvents := failures[test]
   338  			authorEmail, err := getAuthorEmail(ctx, test.pkg, test.name)
   339  			if err != nil {
   340  				log.Printf("unable to determine test author email: %s\n", err)
   341  			}
   342  			var outputs []string
   343  			for _, testEvent := range testEvents {
   344  				outputs = append(outputs, testEvent.Output)
   345  			}
   346  			message := strings.Join(outputs, "")
   347  			title := fmt.Sprintf("%s: %s failed", trimPkg(test.pkg), test.name)
   348  			if err := f(ctx, title, test.pkg, test.name, message, authorEmail); err != nil {
   349  				return errors.Wrap(err, "failed to post issue")
   350  			}
   351  		}
   352  	}
   353  
   354  	// Sort slow tests descendingly by duration.
   355  	sort.Slice(slowPassEvents, func(i, j int) bool {
   356  		return slowPassEvents[i].Elapsed > slowPassEvents[j].Elapsed
   357  	})
   358  	sort.Slice(slowFailEvents, func(i, j int) bool {
   359  		return slowFailEvents[i].Elapsed > slowFailEvents[j].Elapsed
   360  	})
   361  
   362  	report := genSlowTestsReport(slowPassEvents, slowFailEvents)
   363  	if err := writeSlowTestsReport(report); err != nil {
   364  		log.Printf("failed to create slow tests report: %s", err)
   365  	}
   366  
   367  	// If the run timed out, file an issue. A couple of cases:
   368  	// 1) If the test that was running when the package timed out is the longest
   369  	// test, then we blame it. The common case is the test deadlocking - it would
   370  	// have run forever.
   371  	// 2) Otherwise, we don't blame anybody in particular. We file a generic issue
   372  	// listing the package name containing the report of long-running tests.
   373  	if timedOutCulprit.name != "" {
   374  		slowest := slowFailEvents[0]
   375  		if len(slowPassEvents) > 0 && slowPassEvents[0].Elapsed > slowest.Elapsed {
   376  			slowest = slowPassEvents[0]
   377  		}
   378  		if timedOutCulprit == scoped(slowest) {
   379  			// The test that was running when the timeout hit is the one that ran for
   380  			// the longest time.
   381  			authorEmail, err := getAuthorEmail(ctx, timedOutCulprit.pkg, timedOutCulprit.name)
   382  			if err != nil {
   383  				log.Printf("unable to determine test author email: %s\n", err)
   384  			}
   385  			title := fmt.Sprintf("%s: %s timed out", trimPkg(timedOutCulprit.pkg), timedOutCulprit.name)
   386  			log.Printf("timeout culprit found: %s\n", timedOutCulprit.name)
   387  			if err := f(ctx, title, timedOutCulprit.pkg, timedOutCulprit.name, report, authorEmail); err != nil {
   388  				return errors.Wrap(err, "failed to post issue")
   389  			}
   390  		} else {
   391  			packageName := maybePkgFromEnv()
   392  			if packageName == "" {
   393  				packageName = "unknown"
   394  			}
   395  			trimmedPkgName := trimPkg(packageName)
   396  			title := fmt.Sprintf("%s: package timed out", trimmedPkgName)
   397  			// Andrei gets these reports for now, but don't think I'll fix anything
   398  			// you fools.
   399  			// TODO(andrei): Figure out how to assign to the on-call engineer. Maybe
   400  			// get their name from the Slack channel?
   401  			log.Printf("timeout culprit not found\n")
   402  			if err := f(
   403  				ctx, title, packageName, "(unknown)" /* testName */, report, "andreimatei1@gmail.com",
   404  			); err != nil {
   405  				return errors.Wrap(err, "failed to post issue")
   406  			}
   407  		}
   408  	}
   409  
   410  	return nil
   411  }
   412  
   413  func genSlowTestsReport(slowPassingTests, slowFailingTests []testEvent) string {
   414  	var b strings.Builder
   415  	b.WriteString("Slow failing tests:\n")
   416  	for i, te := range slowFailingTests {
   417  		if i == 20 {
   418  			break
   419  		}
   420  		fmt.Fprintf(&b, "%s - %.2fs\n", te.Test, te.Elapsed)
   421  	}
   422  	if len(slowFailingTests) == 0 {
   423  		fmt.Fprint(&b, "<none>\n")
   424  	}
   425  
   426  	b.WriteString("\nSlow passing tests:\n")
   427  	for i, te := range slowPassingTests {
   428  		if i == 20 {
   429  			break
   430  		}
   431  		fmt.Fprintf(&b, "%s - %.2fs\n", te.Test, te.Elapsed)
   432  	}
   433  	if len(slowPassingTests) == 0 {
   434  		fmt.Fprint(&b, "<none>\n")
   435  	}
   436  	return b.String()
   437  }
   438  
   439  func writeSlowTestsReport(report string) error {
   440  	return ioutil.WriteFile("artifacts/slow-tests-report.txt", []byte(report), 0644)
   441  }
   442  
   443  func getAuthorEmail(ctx context.Context, packageName, testName string) (string, error) {
   444  	// Search the source code for the email address of the last committer to touch
   445  	// the first line of the source code that contains testName. Then, ask GitHub
   446  	// for the GitHub username of the user with that email address by searching
   447  	// commits in cockroachdb/cockroach for commits authored by the address.
   448  	subtests := strings.Split(testName, "/")
   449  	testName = subtests[0]
   450  	packageName = strings.TrimPrefix(packageName, "github.com/cockroachdb/cockroach/")
   451  	cmd := exec.Command(`/bin/bash`, `-c`,
   452  		fmt.Sprintf(`git grep -n "func %s" $(git rev-parse --show-toplevel)/%s/*_test.go`,
   453  			testName, packageName))
   454  	// This command returns output such as:
   455  	// ../ccl/storageccl/export_test.go:31:func TestExportCmd(t *testing.T) {
   456  	out, err := cmd.CombinedOutput()
   457  	if err != nil {
   458  		return "", errors.Errorf("couldn't find test %s in %s: %s %s",
   459  			testName, packageName, err, string(out))
   460  	}
   461  	re := regexp.MustCompile(`(.*):(.*):`)
   462  	// The first 2 :-delimited fields are the filename and line number.
   463  	matches := re.FindSubmatch(out)
   464  	if matches == nil {
   465  		return "", errors.Errorf("couldn't find filename/line number for test %s in %s: %s",
   466  			testName, packageName, string(out))
   467  	}
   468  	filename := matches[1]
   469  	linenum := matches[2]
   470  
   471  	// Now run git blame.
   472  	cmd = exec.Command(`/bin/bash`, `-c`,
   473  		fmt.Sprintf(`git blame --porcelain -L%s,+1 %s | grep author-mail`,
   474  			linenum, filename))
   475  	// This command returns output such as:
   476  	// author-mail <jordan@cockroachlabs.com>
   477  	out, err = cmd.CombinedOutput()
   478  	if err != nil {
   479  		return "", errors.Errorf("couldn't find author of test %s in %s: %s %s",
   480  			testName, packageName, err, string(out))
   481  	}
   482  	re = regexp.MustCompile("author-mail <(.*)>")
   483  	matches = re.FindSubmatch(out)
   484  	if matches == nil {
   485  		return "", errors.Errorf("couldn't find author email of test %s in %s: %s",
   486  			testName, packageName, string(out))
   487  	}
   488  	return string(matches[1]), nil
   489  }