golang.org/x/build@v0.0.0-20240506185731-218518f32b70/cmd/retrybuilds/retrybuilds.go (about) 1 // Copyright 2014 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // The retrybuilds command clears build failures from the build.golang.org dashboard 6 // to force them to be rebuilt. 7 // 8 // Valid usage modes: 9 // 10 // retrybuilds -loghash=f45f0eb8 11 // retrybuilds -builder=openbsd-amd64 12 // retrybuilds -builder=openbsd-amd64 -hash=6fecb7 13 // retrybuilds -redo-flaky 14 // retrybuilds -redo-flaky -builder=linux-amd64-clang 15 // retrybuilds -substr="failed to find foo" 16 // retrybuilds -substr="failed to find foo" -builder=linux-amd64-stretch 17 package main 18 19 import ( 20 "bytes" 21 "context" 22 "crypto/hmac" 23 "crypto/md5" 24 "encoding/json" 25 "flag" 26 "fmt" 27 "io" 28 "log" 29 "net/http" 30 "net/url" 31 "os" 32 "path/filepath" 33 "strings" 34 "sync" 35 "time" 36 37 "golang.org/x/build/buildenv" 38 "golang.org/x/build/cmd/coordinator/protos" 39 "golang.org/x/build/internal/iapclient" 40 "golang.org/x/build/internal/secret" 41 "google.golang.org/grpc/codes" 42 "google.golang.org/grpc/metadata" 43 "google.golang.org/grpc/status" 44 ) 45 46 var ( 47 dryRun = flag.Bool("dry-run", false, "just report what would've been done, without changing anything") 48 masterKeyFile = flag.String("masterkey", filepath.Join(os.Getenv("HOME"), "keys", "gobuilder-master.key"), "path to Go builder master key. If present, the key argument is not necessary") 49 keyFile = flag.String("key", "", "path to key file") 50 builder = flag.String("builder", "", "builder to wipe a result for. Empty means all.") 51 hash = flag.String("hash", "", "Hash to wipe. If empty, all will be wiped.") 52 redoFlaky = flag.Bool("redo-flaky", false, "Reset all flaky builds. If builder is empty, the master key is required.") 53 builderPrefix = flag.String("builder-prefix", "https://build.golang.org", "builder URL prefix") 54 logHash = flag.String("loghash", "", "If non-empty, clear the build that failed with this loghash prefix") 55 sendMasterKey = flag.Bool("sendmaster", false, "send the master key in request instead of a builder-specific key; allows overriding actions of revoked keys") 56 branch = flag.String("branch", "master", "branch to find flakes from (for use with -redo-flaky)") 57 substr = flag.String("substr", "", "if non-empty, redoes all build failures whose failure logs contain this substring") 58 grpcHost = flag.String("grpc-host", "build.golang.org:443", "use gRPC for communicating with the Coordinator API") 59 ) 60 61 type Failure struct { 62 Builder string 63 Hash string 64 LogURL string 65 } 66 67 func main() { 68 log.SetFlags(0) 69 buildenv.RegisterStagingFlag() 70 flag.Parse() 71 72 *builderPrefix = strings.TrimSuffix(*builderPrefix, "/") 73 ctx := context.Background() 74 cc, err := iapclient.GRPCClient(ctx, *grpcHost) 75 if err != nil { 76 log.Fatalf("grpc.DialContext(_, %q, _) = %v, wanted no error", *grpcHost, err) 77 } 78 cl := client{ 79 coordinator: protos.NewCoordinatorClient(cc), 80 } 81 82 if *logHash != "" { 83 substr := "/log/" + *logHash 84 for _, f := range failures() { 85 if strings.Contains(f.LogURL, substr) { 86 log.Printf("Restarting %+v", f) 87 cl.wipe(f.Builder, f.Hash) 88 } 89 } 90 log.Printf("wiped %d matching failures\n", cl.wiped) 91 return 92 } 93 if *substr != "" { 94 foreachFailure(func(f Failure, failLog string) { 95 if strings.Contains(failLog, *substr) { 96 log.Printf("Restarting %+v", f) 97 cl.wipe(f.Builder, f.Hash) 98 } 99 }) 100 log.Printf("wiped %d matching failures\n", cl.wiped) 101 return 102 } 103 if *redoFlaky { 104 foreachFailure(func(f Failure, failLog string) { 105 if isFlaky(failLog) { 106 log.Printf("Restarting flaky %+v", f) 107 cl.wipe(f.Builder, f.Hash) 108 } 109 }) 110 log.Printf("wiped %d matching failures\n", cl.wiped) 111 return 112 } 113 if *builder == "" { 114 log.Fatalf("Missing -builder, -redo-flaky, -substr, or -loghash flag.") 115 } 116 if *hash == "" { 117 for _, f := range failures() { 118 if f.Builder != *builder { 119 continue 120 } 121 log.Printf("Restarting %+v", f) 122 cl.wipe(f.Builder, f.Hash) 123 } 124 log.Printf("wiped %d matching failures\n", cl.wiped) 125 return 126 } 127 fullHash := fullHash(*hash) 128 log.Printf("Restarting %q", fullHash) 129 cl.wipe(*builder, fullHash) 130 log.Printf("wiped %d matching failures\n", cl.wiped) 131 } 132 133 func foreachFailure(fn func(f Failure, failLog string)) { 134 gate := make(chan bool, 50) 135 var wg sync.WaitGroup 136 for _, f := range failures() { 137 f := f 138 if *builder != "" && f.Builder != *builder { 139 continue 140 } 141 gate <- true 142 wg.Add(1) 143 go func() { 144 defer wg.Done() 145 defer func() { <-gate }() 146 res, err := http.Get(f.LogURL) 147 if err != nil { 148 log.Fatalf("Error fetching %s: %v", f.LogURL, err) 149 } 150 failLog, err := io.ReadAll(res.Body) 151 res.Body.Close() 152 if err != nil { 153 log.Fatalf("Error reading %s: %v", f.LogURL, err) 154 } 155 fn(f, string(failLog)) 156 }() 157 } 158 wg.Wait() 159 } 160 161 var flakePhrases = []string{ 162 "No space left on device", 163 "no space left on device", // solaris case apparently 164 "fatal error: error in backend: IO failure on output stream", 165 "Boffset: unknown state 0", 166 "Bseek: unknown state 0", 167 "error exporting repository: exit status", 168 "remote error: User Is Over Quota", 169 "fatal: remote did not send all necessary objects", 170 "Failed to schedule \"", // e.g. Failed to schedule "go_test:archive/tar" test after 3 tries. 171 "lookup _xmpp-server._tcp.google.com. on 8.8.8.8:53: dial udp 8.8.8.8:53: i/o timeout", 172 "lookup _xmpp-server._tcp.google.com on", 173 "lookup gmail.com. on 8.8.8.8:53: dial udp 8.8.8.8:53: i/o timeout", 174 "lookup gmail.com on 8.8.8.8:53", 175 "lookup www.mit.edu on ", 176 "undefined: runtime.SetMutexProfileFraction", // ppc64 builders had not-quite-go1.8 bootstrap 177 "make.bat: The parameter is incorrect", 178 "killed", 179 "memory", 180 "allocate", 181 "Killed", 182 "Error running API checker: exit status 1", 183 "/compile: exit status 1", 184 "cmd/link: exit status 1", 185 } 186 187 func isFlaky(failLog string) bool { 188 if strings.Count(strings.TrimSpace(failLog), "\n") < 2 { 189 return true 190 } 191 if strings.HasPrefix(failLog, "exit status ") { 192 return true 193 } 194 if strings.HasPrefix(failLog, "timed out after ") { 195 return true 196 } 197 if strings.HasPrefix(failLog, "Failed to schedule ") { 198 return true 199 } 200 for _, phrase := range flakePhrases { 201 if strings.Contains(failLog, phrase) { 202 return true 203 } 204 } 205 numLines := strings.Count(failLog, "\n") 206 if numLines < 20 && strings.Contains(failLog, "error: exit status") { 207 return true 208 } 209 // e.g. fatal: destination path 'go.tools.TMP' already exists and is not an empty directory. 210 // To be fixed in golang.org/issue/9407 211 if strings.Contains(failLog, "fatal: destination path '") && 212 strings.Contains(failLog, "' already exists and is not an empty directory.") { 213 return true 214 } 215 return false 216 } 217 218 func fullHash(h string) string { 219 if len(h) == 40 { 220 return h 221 } 222 if h != "" { 223 for _, f := range failures() { 224 if strings.HasPrefix(f.Hash, h) { 225 return f.Hash 226 } 227 } 228 } 229 log.Fatalf("invalid hash %q; failed to finds its full hash. Not a recent failure?", h) 230 panic("unreachable") 231 } 232 233 type client struct { 234 coordinator protos.CoordinatorClient 235 wiped int // wiped is how many build results have been wiped. 236 } 237 238 // grpcWipe wipes a git hash failure for the provided builder and hash. 239 // Only the main Go repo is currently supported. 240 // TODO(golang.org/issue/34744) - replace HTTP wipe with this after gRPC API for ClearResults is deployed 241 func (c *client) grpcWipe(builder, hash string) { 242 md := metadata.New(map[string]string{"coordinator-authorization": "builder " + builderKey(builder)}) 243 for i := 0; i < 10; i++ { 244 ctx, cancel := context.WithTimeout(metadata.NewOutgoingContext(context.Background(), md), time.Minute) 245 resp, err := c.coordinator.ClearResults(ctx, &protos.ClearResultsRequest{ 246 Builder: builder, 247 Hash: hash, 248 }) 249 cancel() 250 251 if err != nil { 252 s, _ := status.FromError(err) 253 switch s.Code() { 254 case codes.Aborted: 255 log.Printf("Concurrent datastore transaction wiping %v %v: retrying in 1 second", builder, hash) 256 time.Sleep(time.Second) 257 case codes.DeadlineExceeded: 258 log.Printf("Timeout wiping %v %v: retrying", builder, hash) 259 default: 260 log.Fatalln(err) 261 } 262 continue 263 } 264 log.Printf("cl.ClearResults(%q, %q) = %v: resp: %v", builder, hash, status.Code(err), resp) 265 c.wiped++ 266 return 267 } 268 } 269 270 // wipe wipes the git hash failure for the provided failure. 271 // Only the main go repo is currently supported. 272 func (c *client) wipe(builder, hash string) { 273 if *dryRun { 274 c.wiped++ // Pretend. 275 return 276 } 277 if *grpcHost != "" { 278 // TODO(golang.org/issue/34744) - Remove HTTP logic after gRPC API for ClearResults is deployed 279 // to the Coordinator. 280 c.grpcWipe(builder, hash) 281 return 282 } 283 vals := url.Values{ 284 "builder": {builder}, 285 "hash": {hash}, 286 "key": {builderKey(builder)}, 287 } 288 for i := 0; i < 10; i++ { 289 res, err := http.PostForm(*builderPrefix+"/clear-results?"+vals.Encode(), nil) 290 if err != nil { 291 log.Fatal(err) 292 } 293 body, err := io.ReadAll(res.Body) 294 res.Body.Close() 295 if err != nil { 296 log.Fatal(err) 297 } 298 if res.StatusCode != 200 { 299 log.Fatalf("Error clearing %v hash %q: %v", builder, hash, res.Status) 300 } 301 var dashResponse struct { 302 Error string 303 } 304 if err := json.Unmarshal(body, &dashResponse); err != nil { 305 log.Fatalf("Bad dashboard response: %v\nBody: %s", err, body) 306 } 307 308 switch e := dashResponse.Error; e { 309 case "datastore: concurrent transaction": 310 log.Printf("Concurrent datastore transaction wiping %v %v: retrying in 1 second", builder, hash) 311 time.Sleep(time.Second) 312 continue 313 default: 314 log.Fatalf("Dashboard error: %v", e) 315 case "": 316 c.wiped++ 317 return 318 } 319 } 320 log.Fatalf("Too many datastore transaction issues wiping %v %v", builder, hash) 321 } 322 323 func builderKey(builder string) string { 324 if v, ok := builderKeyFromMaster(builder); ok { 325 return v 326 } 327 if *keyFile == "" { 328 log.Fatalf("No --key specified for builder %s", builder) 329 } 330 slurp, err := os.ReadFile(*keyFile) 331 if err != nil { 332 log.Fatalf("Error reading builder key %s: %v", builder, err) 333 } 334 return strings.TrimSpace(string(slurp)) 335 } 336 337 func builderKeyFromMaster(builder string) (key string, ok bool) { 338 masterKey, err := getMasterKeyFromSecretManager() 339 if err != nil { 340 slurp, err := os.ReadFile(*masterKeyFile) 341 if err != nil { 342 return "", false 343 } 344 masterKey = string(bytes.TrimSpace(slurp)) 345 } 346 if *sendMasterKey { 347 return masterKey, true 348 } 349 h := hmac.New(md5.New, []byte(masterKey)) 350 h.Write([]byte(builder)) 351 return fmt.Sprintf("%x", h.Sum(nil)), true 352 } 353 354 // getMasterKeyFromSecretManager retrieves the master key 355 // from the secret manager service. 356 func getMasterKeyFromSecretManager() (string, error) { 357 sc, err := secret.NewClientInProject(buildenv.FromFlags().ProjectName) 358 if err != nil { 359 return "", err 360 } 361 defer sc.Close() 362 return sc.Retrieve(context.Background(), secret.NameBuilderMasterKey) 363 } 364 365 var ( 366 failMu sync.Mutex 367 failCache []Failure 368 ) 369 370 func failures() (ret []Failure) { 371 failMu.Lock() 372 ret = failCache 373 failMu.Unlock() 374 if ret != nil { 375 return 376 } 377 ret = []Failure{} // non-nil 378 379 res, err := http.Get(*builderPrefix + "/?mode=failures&branch=" + url.QueryEscape(*branch)) 380 if err != nil { 381 log.Fatal(err) 382 } 383 slurp, err := io.ReadAll(res.Body) 384 res.Body.Close() 385 if err != nil { 386 log.Fatal(err) 387 } 388 body := string(slurp) 389 for _, line := range strings.Split(body, "\n") { 390 f := strings.Fields(line) 391 if len(f) == 3 { 392 ret = append(ret, Failure{ 393 Hash: f[0], 394 Builder: f[1], 395 LogURL: f[2], 396 }) 397 } 398 } 399 400 failMu.Lock() 401 failCache = ret 402 failMu.Unlock() 403 return ret 404 }