golang.org/x/build@v0.0.0-20240506185731-218518f32b70/cmd/retrybuilds/retrybuilds.go (about)

     1  // Copyright 2014 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // The retrybuilds command clears build failures from the build.golang.org dashboard
     6  // to force them to be rebuilt.
     7  //
     8  // Valid usage modes:
     9  //
    10  //	retrybuilds -loghash=f45f0eb8
    11  //	retrybuilds -builder=openbsd-amd64
    12  //	retrybuilds -builder=openbsd-amd64 -hash=6fecb7
    13  //	retrybuilds -redo-flaky
    14  //	retrybuilds -redo-flaky -builder=linux-amd64-clang
    15  //	retrybuilds -substr="failed to find foo"
    16  //	retrybuilds -substr="failed to find foo" -builder=linux-amd64-stretch
    17  package main
    18  
    19  import (
    20  	"bytes"
    21  	"context"
    22  	"crypto/hmac"
    23  	"crypto/md5"
    24  	"encoding/json"
    25  	"flag"
    26  	"fmt"
    27  	"io"
    28  	"log"
    29  	"net/http"
    30  	"net/url"
    31  	"os"
    32  	"path/filepath"
    33  	"strings"
    34  	"sync"
    35  	"time"
    36  
    37  	"golang.org/x/build/buildenv"
    38  	"golang.org/x/build/cmd/coordinator/protos"
    39  	"golang.org/x/build/internal/iapclient"
    40  	"golang.org/x/build/internal/secret"
    41  	"google.golang.org/grpc/codes"
    42  	"google.golang.org/grpc/metadata"
    43  	"google.golang.org/grpc/status"
    44  )
    45  
// Command-line flags. They are read throughout the file after flag.Parse
// has been called in main.
var (
	dryRun        = flag.Bool("dry-run", false, "just report what would've been done, without changing anything")
	masterKeyFile = flag.String("masterkey", filepath.Join(os.Getenv("HOME"), "keys", "gobuilder-master.key"), "path to Go builder master key. If present, the key argument is not necessary")
	keyFile       = flag.String("key", "", "path to key file")
	builder       = flag.String("builder", "", "builder to wipe a result for. Empty means all.")
	hash          = flag.String("hash", "", "Hash to wipe. If empty, all will be wiped.")
	redoFlaky     = flag.Bool("redo-flaky", false, "Reset all flaky builds. If builder is empty, the master key is required.")
	// builderPrefix is used both to list failures and for the HTTP
	// clear-results request; main strips any trailing "/" from it.
	builderPrefix = flag.String("builder-prefix", "https://build.golang.org", "builder URL prefix")
	logHash       = flag.String("loghash", "", "If non-empty, clear the build that failed with this loghash prefix")
	sendMasterKey = flag.Bool("sendmaster", false, "send the master key in request instead of a builder-specific key; allows overriding actions of revoked keys")
	branch        = flag.String("branch", "master", "branch to find flakes from (for use with -redo-flaky)")
	substr        = flag.String("substr", "", "if non-empty, redoes all build failures whose failure logs contain this substring")
	// grpcHost selects the transport used by wipe: when it is non-empty the
	// gRPC ClearResults call is used, otherwise the HTTP clear-results endpoint.
	grpcHost = flag.String("grpc-host", "build.golang.org:443", "use gRPC for communicating with the Coordinator API")
)
    60  
// Failure describes one failed build, as parsed by failures from a line of
// the dashboard's "mode=failures" listing.
type Failure struct {
	Builder string // name of the builder that failed (second field of the listing line)
	Hash    string // hash of the failing commit (first field of the listing line)
	LogURL  string // URL of the failure log (third field of the listing line)
}
    66  
    67  func main() {
    68  	log.SetFlags(0)
    69  	buildenv.RegisterStagingFlag()
    70  	flag.Parse()
    71  
    72  	*builderPrefix = strings.TrimSuffix(*builderPrefix, "/")
    73  	ctx := context.Background()
    74  	cc, err := iapclient.GRPCClient(ctx, *grpcHost)
    75  	if err != nil {
    76  		log.Fatalf("grpc.DialContext(_, %q, _) = %v, wanted no error", *grpcHost, err)
    77  	}
    78  	cl := client{
    79  		coordinator: protos.NewCoordinatorClient(cc),
    80  	}
    81  
    82  	if *logHash != "" {
    83  		substr := "/log/" + *logHash
    84  		for _, f := range failures() {
    85  			if strings.Contains(f.LogURL, substr) {
    86  				log.Printf("Restarting %+v", f)
    87  				cl.wipe(f.Builder, f.Hash)
    88  			}
    89  		}
    90  		log.Printf("wiped %d matching failures\n", cl.wiped)
    91  		return
    92  	}
    93  	if *substr != "" {
    94  		foreachFailure(func(f Failure, failLog string) {
    95  			if strings.Contains(failLog, *substr) {
    96  				log.Printf("Restarting %+v", f)
    97  				cl.wipe(f.Builder, f.Hash)
    98  			}
    99  		})
   100  		log.Printf("wiped %d matching failures\n", cl.wiped)
   101  		return
   102  	}
   103  	if *redoFlaky {
   104  		foreachFailure(func(f Failure, failLog string) {
   105  			if isFlaky(failLog) {
   106  				log.Printf("Restarting flaky %+v", f)
   107  				cl.wipe(f.Builder, f.Hash)
   108  			}
   109  		})
   110  		log.Printf("wiped %d matching failures\n", cl.wiped)
   111  		return
   112  	}
   113  	if *builder == "" {
   114  		log.Fatalf("Missing -builder, -redo-flaky, -substr, or -loghash flag.")
   115  	}
   116  	if *hash == "" {
   117  		for _, f := range failures() {
   118  			if f.Builder != *builder {
   119  				continue
   120  			}
   121  			log.Printf("Restarting %+v", f)
   122  			cl.wipe(f.Builder, f.Hash)
   123  		}
   124  		log.Printf("wiped %d matching failures\n", cl.wiped)
   125  		return
   126  	}
   127  	fullHash := fullHash(*hash)
   128  	log.Printf("Restarting %q", fullHash)
   129  	cl.wipe(*builder, fullHash)
   130  	log.Printf("wiped %d matching failures\n", cl.wiped)
   131  }
   132  
   133  func foreachFailure(fn func(f Failure, failLog string)) {
   134  	gate := make(chan bool, 50)
   135  	var wg sync.WaitGroup
   136  	for _, f := range failures() {
   137  		f := f
   138  		if *builder != "" && f.Builder != *builder {
   139  			continue
   140  		}
   141  		gate <- true
   142  		wg.Add(1)
   143  		go func() {
   144  			defer wg.Done()
   145  			defer func() { <-gate }()
   146  			res, err := http.Get(f.LogURL)
   147  			if err != nil {
   148  				log.Fatalf("Error fetching %s: %v", f.LogURL, err)
   149  			}
   150  			failLog, err := io.ReadAll(res.Body)
   151  			res.Body.Close()
   152  			if err != nil {
   153  				log.Fatalf("Error reading %s: %v", f.LogURL, err)
   154  			}
   155  			fn(f, string(failLog))
   156  		}()
   157  	}
   158  	wg.Wait()
   159  }
   160  
   161  var flakePhrases = []string{
   162  	"No space left on device",
   163  	"no space left on device", // solaris case apparently
   164  	"fatal error: error in backend: IO failure on output stream",
   165  	"Boffset: unknown state 0",
   166  	"Bseek: unknown state 0",
   167  	"error exporting repository: exit status",
   168  	"remote error: User Is Over Quota",
   169  	"fatal: remote did not send all necessary objects",
   170  	"Failed to schedule \"", // e.g. Failed to schedule "go_test:archive/tar" test after 3 tries.
   171  	"lookup _xmpp-server._tcp.google.com. on 8.8.8.8:53: dial udp 8.8.8.8:53: i/o timeout",
   172  	"lookup _xmpp-server._tcp.google.com on",
   173  	"lookup gmail.com. on 8.8.8.8:53: dial udp 8.8.8.8:53: i/o timeout",
   174  	"lookup gmail.com on 8.8.8.8:53",
   175  	"lookup www.mit.edu on ",
   176  	"undefined: runtime.SetMutexProfileFraction", // ppc64 builders had not-quite-go1.8 bootstrap
   177  	"make.bat: The parameter is incorrect",
   178  	"killed",
   179  	"memory",
   180  	"allocate",
   181  	"Killed",
   182  	"Error running API checker: exit status 1",
   183  	"/compile: exit status 1",
   184  	"cmd/link: exit status 1",
   185  }
   186  
   187  func isFlaky(failLog string) bool {
   188  	if strings.Count(strings.TrimSpace(failLog), "\n") < 2 {
   189  		return true
   190  	}
   191  	if strings.HasPrefix(failLog, "exit status ") {
   192  		return true
   193  	}
   194  	if strings.HasPrefix(failLog, "timed out after ") {
   195  		return true
   196  	}
   197  	if strings.HasPrefix(failLog, "Failed to schedule ") {
   198  		return true
   199  	}
   200  	for _, phrase := range flakePhrases {
   201  		if strings.Contains(failLog, phrase) {
   202  			return true
   203  		}
   204  	}
   205  	numLines := strings.Count(failLog, "\n")
   206  	if numLines < 20 && strings.Contains(failLog, "error: exit status") {
   207  		return true
   208  	}
   209  	// e.g. fatal: destination path 'go.tools.TMP' already exists and is not an empty directory.
   210  	// To be fixed in golang.org/issue/9407
   211  	if strings.Contains(failLog, "fatal: destination path '") &&
   212  		strings.Contains(failLog, "' already exists and is not an empty directory.") {
   213  		return true
   214  	}
   215  	return false
   216  }
   217  
   218  func fullHash(h string) string {
   219  	if len(h) == 40 {
   220  		return h
   221  	}
   222  	if h != "" {
   223  		for _, f := range failures() {
   224  			if strings.HasPrefix(f.Hash, h) {
   225  				return f.Hash
   226  			}
   227  		}
   228  	}
   229  	log.Fatalf("invalid hash %q; failed to finds its full hash. Not a recent failure?", h)
   230  	panic("unreachable")
   231  }
   232  
   233  type client struct {
   234  	coordinator protos.CoordinatorClient
   235  	wiped       int // wiped is how many build results have been wiped.
   236  }
   237  
   238  // grpcWipe wipes a git hash failure for the provided builder and hash.
   239  // Only the main Go repo is currently supported.
   240  // TODO(golang.org/issue/34744) - replace HTTP wipe with this after gRPC API for ClearResults is deployed
   241  func (c *client) grpcWipe(builder, hash string) {
   242  	md := metadata.New(map[string]string{"coordinator-authorization": "builder " + builderKey(builder)})
   243  	for i := 0; i < 10; i++ {
   244  		ctx, cancel := context.WithTimeout(metadata.NewOutgoingContext(context.Background(), md), time.Minute)
   245  		resp, err := c.coordinator.ClearResults(ctx, &protos.ClearResultsRequest{
   246  			Builder: builder,
   247  			Hash:    hash,
   248  		})
   249  		cancel()
   250  
   251  		if err != nil {
   252  			s, _ := status.FromError(err)
   253  			switch s.Code() {
   254  			case codes.Aborted:
   255  				log.Printf("Concurrent datastore transaction wiping %v %v: retrying in 1 second", builder, hash)
   256  				time.Sleep(time.Second)
   257  			case codes.DeadlineExceeded:
   258  				log.Printf("Timeout wiping %v %v: retrying", builder, hash)
   259  			default:
   260  				log.Fatalln(err)
   261  			}
   262  			continue
   263  		}
   264  		log.Printf("cl.ClearResults(%q, %q) = %v: resp: %v", builder, hash, status.Code(err), resp)
   265  		c.wiped++
   266  		return
   267  	}
   268  }
   269  
   270  // wipe wipes the git hash failure for the provided failure.
   271  // Only the main go repo is currently supported.
   272  func (c *client) wipe(builder, hash string) {
   273  	if *dryRun {
   274  		c.wiped++ // Pretend.
   275  		return
   276  	}
   277  	if *grpcHost != "" {
   278  		// TODO(golang.org/issue/34744) - Remove HTTP logic after gRPC API for ClearResults is deployed
   279  		// to the Coordinator.
   280  		c.grpcWipe(builder, hash)
   281  		return
   282  	}
   283  	vals := url.Values{
   284  		"builder": {builder},
   285  		"hash":    {hash},
   286  		"key":     {builderKey(builder)},
   287  	}
   288  	for i := 0; i < 10; i++ {
   289  		res, err := http.PostForm(*builderPrefix+"/clear-results?"+vals.Encode(), nil)
   290  		if err != nil {
   291  			log.Fatal(err)
   292  		}
   293  		body, err := io.ReadAll(res.Body)
   294  		res.Body.Close()
   295  		if err != nil {
   296  			log.Fatal(err)
   297  		}
   298  		if res.StatusCode != 200 {
   299  			log.Fatalf("Error clearing %v hash %q: %v", builder, hash, res.Status)
   300  		}
   301  		var dashResponse struct {
   302  			Error string
   303  		}
   304  		if err := json.Unmarshal(body, &dashResponse); err != nil {
   305  			log.Fatalf("Bad dashboard response: %v\nBody: %s", err, body)
   306  		}
   307  
   308  		switch e := dashResponse.Error; e {
   309  		case "datastore: concurrent transaction":
   310  			log.Printf("Concurrent datastore transaction wiping %v %v: retrying in 1 second", builder, hash)
   311  			time.Sleep(time.Second)
   312  			continue
   313  		default:
   314  			log.Fatalf("Dashboard error: %v", e)
   315  		case "":
   316  			c.wiped++
   317  			return
   318  		}
   319  	}
   320  	log.Fatalf("Too many datastore transaction issues wiping %v %v", builder, hash)
   321  }
   322  
   323  func builderKey(builder string) string {
   324  	if v, ok := builderKeyFromMaster(builder); ok {
   325  		return v
   326  	}
   327  	if *keyFile == "" {
   328  		log.Fatalf("No --key specified for builder %s", builder)
   329  	}
   330  	slurp, err := os.ReadFile(*keyFile)
   331  	if err != nil {
   332  		log.Fatalf("Error reading builder key %s: %v", builder, err)
   333  	}
   334  	return strings.TrimSpace(string(slurp))
   335  }
   336  
   337  func builderKeyFromMaster(builder string) (key string, ok bool) {
   338  	masterKey, err := getMasterKeyFromSecretManager()
   339  	if err != nil {
   340  		slurp, err := os.ReadFile(*masterKeyFile)
   341  		if err != nil {
   342  			return "", false
   343  		}
   344  		masterKey = string(bytes.TrimSpace(slurp))
   345  	}
   346  	if *sendMasterKey {
   347  		return masterKey, true
   348  	}
   349  	h := hmac.New(md5.New, []byte(masterKey))
   350  	h.Write([]byte(builder))
   351  	return fmt.Sprintf("%x", h.Sum(nil)), true
   352  }
   353  
   354  // getMasterKeyFromSecretManager retrieves the master key
   355  // from the secret manager service.
   356  func getMasterKeyFromSecretManager() (string, error) {
   357  	sc, err := secret.NewClientInProject(buildenv.FromFlags().ProjectName)
   358  	if err != nil {
   359  		return "", err
   360  	}
   361  	defer sc.Close()
   362  	return sc.Retrieve(context.Background(), secret.NameBuilderMasterKey)
   363  }
   364  
   365  var (
   366  	failMu    sync.Mutex
   367  	failCache []Failure
   368  )
   369  
   370  func failures() (ret []Failure) {
   371  	failMu.Lock()
   372  	ret = failCache
   373  	failMu.Unlock()
   374  	if ret != nil {
   375  		return
   376  	}
   377  	ret = []Failure{} // non-nil
   378  
   379  	res, err := http.Get(*builderPrefix + "/?mode=failures&branch=" + url.QueryEscape(*branch))
   380  	if err != nil {
   381  		log.Fatal(err)
   382  	}
   383  	slurp, err := io.ReadAll(res.Body)
   384  	res.Body.Close()
   385  	if err != nil {
   386  		log.Fatal(err)
   387  	}
   388  	body := string(slurp)
   389  	for _, line := range strings.Split(body, "\n") {
   390  		f := strings.Fields(line)
   391  		if len(f) == 3 {
   392  			ret = append(ret, Failure{
   393  				Hash:    f[0],
   394  				Builder: f[1],
   395  				LogURL:  f[2],
   396  			})
   397  		}
   398  	}
   399  
   400  	failMu.Lock()
   401  	failCache = ret
   402  	failMu.Unlock()
   403  	return ret
   404  }