golang.org/x/build@v0.0.0-20240506185731-218518f32b70/cmd/coordinator/coordinator.go (about) 1 // Copyright 2014 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 //go:build linux || darwin 6 7 // The coordinator runs the majority of the Go build system. 8 // 9 // It is responsible for finding build work, executing it, 10 // and displaying the results. 11 // 12 // For an overview of the Go build system, see the README at 13 // the root of the x/build repo. 14 package main // import "golang.org/x/build/cmd/coordinator" 15 16 import ( 17 "bytes" 18 "context" 19 "crypto/rand" 20 "crypto/sha1" 21 "crypto/tls" 22 "encoding/json" 23 "errors" 24 "flag" 25 "fmt" 26 "html" 27 "io" 28 "log" 29 "net/http" 30 "net/url" 31 "os" 32 "sort" 33 "strings" 34 "sync" 35 "time" 36 "unicode" 37 38 "cloud.google.com/go/compute/metadata" 39 "cloud.google.com/go/storage" 40 "go.chromium.org/luci/auth" 41 buildbucketpb "go.chromium.org/luci/buildbucket/proto" 42 "go.chromium.org/luci/grpc/prpc" 43 "go.chromium.org/luci/hardcoded/chromeinfra" 44 "golang.org/x/build/buildenv" 45 "golang.org/x/build/buildlet" 46 builddash "golang.org/x/build/cmd/coordinator/internal/dashboard" 47 "golang.org/x/build/cmd/coordinator/internal/legacydash" 48 "golang.org/x/build/cmd/coordinator/internal/lucipoll" 49 "golang.org/x/build/cmd/coordinator/protos" 50 "golang.org/x/build/dashboard" 51 "golang.org/x/build/gerrit" 52 "golang.org/x/build/internal/access" 53 "golang.org/x/build/internal/buildgo" 54 "golang.org/x/build/internal/buildstats" 55 "golang.org/x/build/internal/cloud" 56 "golang.org/x/build/internal/coordinator/pool" 57 "golang.org/x/build/internal/coordinator/pool/queue" 58 "golang.org/x/build/internal/coordinator/remote" 59 "golang.org/x/build/internal/coordinator/schedule" 60 "golang.org/x/build/internal/gomote" 61 gomoteprotos "golang.org/x/build/internal/gomote/protos" 62 "golang.org/x/build/internal/https" 63 "golang.org/x/build/internal/metrics" 64 "golang.org/x/build/internal/secret" 65 "golang.org/x/build/kubernetes/gke" 66 "golang.org/x/build/maintner/maintnerd/apipb" 67 "golang.org/x/build/repos" 68 "golang.org/x/build/revdial/v2" 69 "golang.org/x/build/types" 70 "golang.org/x/exp/slices" 71 "golang.org/x/time/rate" 72 "google.golang.org/api/option" 73 "google.golang.org/grpc" 74 "google.golang.org/grpc/credentials" 75 ) 76 77 const ( 78 // eventDone is a build event name meaning the build was 79 // completed (either successfully or with remote errors). 80 // Notably, it is NOT included for network/communication 81 // errors. 82 eventDone = "done" 83 84 // eventSkipBuildMissingDep is a build event name meaning 85 // the builder type is not applicable to the commit being 86 // tested because the commit lacks a necessary dependency 87 // in its git history. 88 eventSkipBuildMissingDep = "skipped_build_missing_dep" 89 ) 90 91 var ( 92 processStartTime = time.Now() 93 processID = "P" + randHex(9) 94 ) 95 96 var sched = schedule.NewScheduler() 97 98 var Version string // set by linker -X 99 100 // devPause is a debug option to pause for 5 minutes after the build 101 // finishes before destroying buildlets. 102 const devPause = false 103 104 // stagingTryWork is a debug option to enable or disable running 105 // trybot work in staging. 106 // 107 // If enabled, only open CLs containing "DO NOT SUBMIT" and "STAGING" 108 // in their commit message (in addition to being marked Run-TryBot+1) 109 // will be run. 110 const stagingTryWork = true 111 112 var ( 113 masterKeyFile = flag.String("masterkey", "", "Path to builder master key. Else fetched using GCE project attribute 'builder-master-key'.") 114 mode = flag.String("mode", "", "Valid modes are 'dev', 'prod', or '' for auto-detect. dev means localhost development, not be confused with staging on go-dashboard-dev, which is still the 'prod' mode.") 115 buildEnvName = flag.String("env", "", "The build environment configuration to use. Not required if running in dev mode locally or prod mode on GCE.") 116 devEnableGCE = flag.Bool("dev_gce", false, "Whether or not to enable the GCE pool when in dev mode. The pool is enabled by default in prod mode.") 117 devEnableEC2 = flag.Bool("dev_ec2", false, "Whether or not to enable the EC2 pool when in dev mode. The pool is enabled by default in prod mode.") 118 sshAddr = flag.String("ssh_addr", ":2222", "Address the gomote SSH server should listen on") 119 ) 120 121 // LOCK ORDER: 122 // statusMu, buildStatus.mu, trySet.mu 123 // (Other locks, such as the remoteBuildlet mutex should 124 // not be used along with other locks) 125 126 var ( 127 statusMu sync.Mutex // guards the following four structures; see LOCK ORDER comment above 128 status = map[buildgo.BuilderRev]*buildStatus{} 129 statusDone []*buildStatus // finished recently, capped to maxStatusDone 130 tries = map[tryKey]*trySet{} // trybot builds 131 tryList []tryKey 132 ) 133 134 var maintnerClient apipb.MaintnerServiceClient 135 136 const ( 137 maxStatusDone = 30 138 ) 139 140 var validHosts = map[string]bool{ 141 "farmer.golang.org": true, 142 "build.golang.org": true, 143 } 144 145 // hostPathHandler infers the host from the first element of the URL path, 146 // and rewrites URLs in the output HTML accordingly. It disables response 147 // compression to simplify the process of link rewriting. 148 func hostPathHandler(h http.Handler) http.Handler { 149 return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 150 // Don't bother rewriting ReverseHandler requests. ReverseHandler 151 // must be a Hijacker. Other handlers must not be a Hijacker to 152 // serve HTTP/2 requests. 153 if strings.HasPrefix(r.URL.Path, "/reverse") || strings.HasPrefix(r.URL.Path, "/revdial") { 154 h.ServeHTTP(w, r) 155 return 156 } 157 elem, rest := strings.TrimPrefix(r.URL.Path, "/"), "" 158 if i := strings.Index(elem, "/"); i >= 0 { 159 elem, rest = elem[:i], elem[i+1:] 160 } 161 if !validHosts[elem] { 162 u := "/farmer.golang.org" + r.URL.EscapedPath() 163 if r.URL.RawQuery != "" { 164 u += "?" + r.URL.RawQuery 165 } 166 http.Redirect(w, r, u, http.StatusTemporaryRedirect) 167 return 168 } 169 170 r.Host = elem 171 r.URL.Host = elem 172 r.URL.Path = "/" + rest 173 r.Header.Set("Accept-Encoding", "identity") // Disable compression for link rewriting. 174 lw := &linkRewriter{ResponseWriter: w, host: r.Host} 175 h.ServeHTTP(lw, r) 176 lw.Flush() 177 }) 178 } 179 180 // A linkRewriter is a ResponseWriter that rewrites links in HTML output. 181 // It rewrites relative links /foo to be /host/foo, and it rewrites any link 182 // https://h/foo or //h/foo, where h is in validHosts, to be /h/foo. 183 // This corrects the links to have the right form for the local development mode. 184 type linkRewriter struct { 185 http.ResponseWriter 186 host string 187 buf []byte 188 ct string // content-type 189 } 190 191 func (r *linkRewriter) WriteHeader(code int) { 192 if l := r.Header().Get("Location"); l != "" { 193 if u, err := url.Parse(l); err == nil { 194 if u.Host == "" { 195 u.Path = "/" + r.host + u.Path 196 } else if validHosts[u.Host] { 197 u.Path = "/" + u.Host + u.Path 198 u.Scheme, u.Host = "", "" 199 } 200 r.Header().Set("Location", u.String()) 201 } 202 } 203 r.ResponseWriter.WriteHeader(code) 204 } 205 206 func (r *linkRewriter) Write(data []byte) (int, error) { 207 if r.ct == "" { 208 ct := r.Header().Get("Content-Type") 209 if ct == "" { 210 // Note: should use first 512 bytes, but first write is fine for our purposes. 211 ct = http.DetectContentType(data) 212 } 213 r.ct = ct 214 } 215 if !strings.HasPrefix(r.ct, "text/html") { 216 return r.ResponseWriter.Write(data) 217 } 218 r.buf = append(r.buf, data...) 219 return len(data), nil 220 } 221 222 func (r *linkRewriter) Flush() { 223 var repl []string 224 for host := range validHosts { 225 repl = append(repl, `href="https://`+host, `href="/`+host) 226 repl = append(repl, `href="//`+host, `href="/`+host) // Handle scheme-less URLs. 227 } 228 repl = append(repl, `href="/`, `href="/`+r.host+`/`) 229 strings.NewReplacer(repl...).WriteString(r.ResponseWriter, string(r.buf)) 230 r.buf = nil 231 } 232 233 func main() { 234 https.RegisterFlags(flag.CommandLine) 235 flag.Parse() 236 237 pool.SetProcessMetadata(processID, processStartTime) 238 239 if Version == "" && *mode == "dev" { 240 Version = "dev" 241 } 242 log.Printf("coordinator version %q starting", Version) 243 244 sc := mustCreateSecretClientOnGCE() 245 if sc != nil { 246 defer sc.Close() 247 } 248 249 mustInitMasterKeyCache(sc) 250 251 // TODO(golang.org/issue/38337): remove package level variables where possible. 252 // TODO(golang.org/issue/36841): remove after key functions are moved into 253 // a shared package. 254 pool.SetBuilderMasterKey(masterKey()) 255 sp := remote.NewSessionPool(context.Background()) 256 err := pool.InitGCE(sc, &basePinErr, sp.IsSession, *buildEnvName, *mode) 257 if err != nil { 258 if *mode == "" { 259 *mode = "dev" 260 } 261 log.Printf("VM support disabled due to error initializing GCE: %v", err) 262 } else { 263 if *mode == "" { 264 *mode = "prod" 265 } 266 } 267 268 gce := pool.NewGCEConfiguration() 269 270 if gce.BuildEnv().KubeServices.Name != "" { 271 goKubeClient, err := gke.NewClient(context.Background(), 272 gce.BuildEnv().KubeServices.Name, 273 gce.BuildEnv().KubeServices.Location(), 274 gke.OptNamespace(gce.BuildEnv().KubeServices.Namespace), 275 gke.OptProject(gce.BuildEnv().ProjectName), 276 gke.OptTokenSource(gce.GCPCredentials().TokenSource)) 277 if err != nil { 278 log.Fatalf("connecting to GKE failed: %v", err) 279 } 280 go monitorGitMirror(goKubeClient) 281 } else { 282 log.Println("Kubernetes services disabled due to empty KubeServices.Name") 283 } 284 285 if *mode == "prod" || (*mode == "dev" && *devEnableEC2) { 286 // TODO(golang.org/issues/38337) the coordinator will use a package scoped pool 287 // until the coordinator is refactored to not require them. 288 ec2Pool := mustCreateEC2BuildletPool(sc, sp.IsSession) 289 defer ec2Pool.Close() 290 } 291 292 if *mode == "dev" { 293 // Replace linux-amd64 with a config using a -localdev reverse 294 // buildlet so it is possible to run local builds by starting a 295 // local reverse buildlet. 296 dashboard.Builders["linux-amd64"] = &dashboard.BuildConfig{ 297 Name: "linux-amd64", 298 HostType: "host-linux-amd64-localdev", 299 } 300 dashboard.Builders["linux-amd64-localdev"] = &dashboard.BuildConfig{ 301 Name: "linux-amd64", 302 HostType: "host-linux-amd64-localdev", 303 } 304 } 305 306 go pool.CoordinatorProcess().UpdateInstanceRecord() 307 308 switch *mode { 309 case "dev", "prod": 310 log.Printf("Running in %s mode", *mode) 311 default: 312 log.Fatalf("Unknown mode: %q", *mode) 313 } 314 315 mux := http.NewServeMux() 316 317 if *mode == "dev" { 318 // Serve a mock TryBot Status page at /try-dev. 319 initTryDev(mux) 320 } 321 322 addHealthCheckers(context.Background(), mux, sc) 323 324 gr, err := metrics.GKEResource("coordinator-deployment") 325 if err != nil && metadata.OnGCE() { 326 log.Println("metrics.GKEResource:", err) 327 } 328 if ms, err := metrics.NewService(gr, views); err != nil { 329 log.Println("failed to initialize metrics:", err) 330 } else { 331 mux.Handle("/metrics", ms) 332 defer ms.Stop() 333 } 334 335 dialOpts := []grpc.DialOption{ 336 grpc.WithBlock(), 337 grpc.WithTimeout(10 * time.Second), 338 grpc.WithTransportCredentials(credentials.NewTLS(&tls.Config{NextProtos: []string{"h2"}})), 339 } 340 mServer := "maintner.golang.org:443" 341 cc, err := grpc.Dial(mServer, dialOpts...) 342 if err != nil { 343 log.Fatalf("unable to grpc.Dial(%q) = _, %s", mServer, err) 344 } 345 maintnerClient = apipb.NewMaintnerServiceClient(cc) 346 347 sshCA := mustRetrieveSSHCertificateAuthority() 348 349 var gomoteBucket string 350 var opts []grpc.ServerOption 351 if *buildEnvName == "" && *mode != "dev" && metadata.OnGCE() { 352 projectID, err := metadata.ProjectID() 353 if err != nil { 354 log.Fatalf("metadata.ProjectID() = %v", err) 355 } 356 env := buildenv.ByProjectID(projectID) 357 gomoteBucket = env.GomoteTransferBucket 358 var coordinatorBackend, serviceID = "coordinator-internal-iap", "" 359 if serviceID = env.IAPServiceID(coordinatorBackend); serviceID == "" { 360 log.Fatalf("unable to retrieve Service ID for backend service=%q", coordinatorBackend) 361 } 362 opts = append(opts, grpc.UnaryInterceptor(access.RequireIAPAuthUnaryInterceptor(access.IAPSkipAudienceValidation))) 363 opts = append(opts, grpc.StreamInterceptor(access.RequireIAPAuthStreamInterceptor(access.IAPSkipAudienceValidation))) 364 } 365 // grpcServer is a shared gRPC server. It is global, as it needs to be used in places that aren't factored otherwise. 366 grpcServer := grpc.NewServer(opts...) 367 368 var luciHTTPClient *http.Client 369 switch *mode { 370 case "prod": 371 var err error 372 luciHTTPClient, err = auth.NewAuthenticator(context.Background(), auth.SilentLogin, auth.Options{GCEAllowAsDefault: true}).Client() 373 if err != nil { 374 log.Fatalln("luci/auth.NewAuthenticator:", err) 375 } 376 case "dev": 377 var err error 378 luciHTTPClient, err = auth.NewAuthenticator(context.Background(), auth.SilentLogin, chromeinfra.DefaultAuthOptions()).Client() 379 if err != nil { 380 log.Fatalln("luci/auth.NewAuthenticator:", err) 381 } 382 } 383 buildersCl := buildbucketpb.NewBuildersClient(&prpc.Client{ 384 C: luciHTTPClient, 385 Host: "cr-buildbucket.appspot.com", 386 }) 387 buildsCl := buildbucketpb.NewBuildsClient(&prpc.Client{ 388 C: luciHTTPClient, 389 Host: "cr-buildbucket.appspot.com", 390 }) 391 luciPoll := lucipoll.NewService(maintnerClient, buildersCl, buildsCl) 392 dashV1 := legacydash.Handler(gce.GoDSClient(), maintnerClient, luciPoll, string(masterKey()), grpcServer) 393 dashV2 := &builddash.Handler{Datastore: gce.GoDSClient(), Maintner: maintnerClient, LUCI: luciPoll} 394 gs := &gRPCServer{dashboardURL: "https://build.golang.org"} 395 setSessionPool(sp) 396 gomoteServer := gomote.New(sp, sched, sshCA, gomoteBucket, mustStorageClient()) 397 protos.RegisterCoordinatorServer(grpcServer, gs) 398 gomoteprotos.RegisterGomoteServiceServer(grpcServer, gomoteServer) 399 mux.HandleFunc("/", grpcHandlerFunc(grpcServer, handleStatus)) // Serve a status page at farmer.golang.org. 400 mux.Handle("build.golang.org/", dashV1) // Serve a build dashboard at build.golang.org. 401 mux.Handle("build-staging.golang.org/", dashV1) 402 mux.HandleFunc("/builders", handleBuilders) 403 mux.HandleFunc("/temporarylogs", handleLogs) 404 mux.HandleFunc("/reverse", pool.HandleReverse) 405 mux.Handle("/revdial", revdial.ConnHandler()) 406 mux.HandleFunc("/style.css", handleStyleCSS) 407 mux.HandleFunc("/try", serveTryStatus(false)) 408 mux.HandleFunc("/try.json", serveTryStatus(true)) 409 mux.HandleFunc("/status/post-submit-active.json", handlePostSubmitActiveJSON) 410 mux.Handle("/dashboard", dashV2) 411 mux.HandleFunc("/queues", handleQueues) 412 if *mode == "dev" { 413 // TODO(crawshaw): do more in dev mode 414 gce.BuildletPool().SetEnabled(*devEnableGCE) 415 if *devEnableGCE || *devEnableEC2 { 416 go findWorkLoop() 417 } 418 } else { 419 go gce.BuildletPool().CleanUpOldVMs() 420 421 if gce.InStaging() { 422 dashboard.Builders = stagingClusterBuilders() 423 } 424 425 go listenAndServeInternalModuleProxy() 426 go findWorkLoop() 427 go findTryWorkLoop() 428 go reportReverseCountMetrics() 429 // TODO(cmang): gccgo will need its own findWorkLoop 430 } 431 432 ctx := context.Background() 433 configureSSHServer := func() (*remote.SSHServer, error) { 434 privateKey, publicKey, err := retrieveSSHKeys(ctx, sc, *mode) 435 if err != nil { 436 return nil, fmt.Errorf("unable to retrieve keys for SSH Server: %v", err) 437 } 438 return remote.NewSSHServer(*sshAddr, privateKey, publicKey, sshCA, sp) 439 } 440 sshServ, err := configureSSHServer() 441 if err != nil { 442 log.Printf("unable to configure SSH server: %s", err) 443 } else { 444 go func() { 445 log.Printf("running SSH server on %s", *sshAddr) 446 err := sshServ.ListenAndServe() 447 log.Printf("SSH server ended with error: %v", err) 448 }() 449 defer func() { 450 err := sshServ.Close() 451 if err != nil { 452 log.Printf("unable to close SSH server: %s", err) 453 } 454 }() 455 } 456 if *mode == "dev" { 457 // Use hostPathHandler in local development mode (only) to improve 458 // convenience of testing multiple domains that coordinator serves. 459 log.Fatalln(https.ListenAndServe(context.Background(), hostPathHandler(mux))) 460 } 461 log.Fatalln(https.ListenAndServe(context.Background(), mux)) 462 } 463 464 // ignoreAllNewWork, when true, prevents addWork from doing anything. 465 // It's sometimes set in staging mode when people are debugging 466 // certain paths. 467 var ignoreAllNewWork bool 468 469 // addWorkTestHook is optionally set by tests. 470 var addWorkTestHook func(buildgo.BuilderRev, commitDetail) 471 472 type commitDetail struct { 473 // RevCommitTime is always the git committer time of the associated 474 // BuilderRev.Rev. 475 RevCommitTime time.Time 476 477 // SubRevCommitTime is always the git committer time of the associated 478 // BuilderRev.SubRev, if it exists. Otherwise, it's the zero value. 479 SubRevCommitTime time.Time 480 481 // Branch for BuilderRev.Rev. 482 RevBranch string 483 484 // Branch for BuilderRev.SubRev, if it exists. 485 SubRevBranch string 486 487 // AuthorId is the gerrit-internal ID for the commit author, if 488 // available. For sub-repo trybots, this is the author of the 489 // commit from the trybot CL. 490 AuthorId int64 491 492 // AuthorEmail is the commit author from Gerrit, if available. 493 // For sub-repo trybots, this is the author of the 494 // commit from the trybot CL. 495 AuthorEmail string 496 } 497 498 // addWorkDetail adds some work to (maybe) do, if it's not already 499 // enqueued and the builders are configured to run the given repo. The 500 // detail argument is optional and used for scheduling. It's currently 501 // only used for post-submit builds. 502 func addWorkDetail(work buildgo.BuilderRev, detail commitDetail) { 503 if f := addWorkTestHook; f != nil { 504 f(work, detail) 505 return 506 } 507 if ignoreAllNewWork || isBuilding(work) { 508 return 509 } 510 if !mayBuildRev(work) { 511 if pool.NewGCEConfiguration().InStaging() { 512 if _, ok := dashboard.Builders[work.Name]; ok && logCantBuildStaging.Allow() { 513 log.Printf("may not build %v; skipping", work) 514 } 515 } 516 return 517 } 518 st, err := newBuild(work, detail) 519 if err != nil { 520 log.Printf("Bad build work params %v: %v", work, err) 521 return 522 } 523 st.start() 524 } 525 526 func stagingClusterBuilders() map[string]*dashboard.BuildConfig { 527 m := map[string]*dashboard.BuildConfig{} 528 for _, name := range []string{ 529 "linux-amd64", 530 "linux-amd64-sid", 531 "linux-amd64-clang", 532 "js-wasm-node18", 533 } { 534 if c, ok := dashboard.Builders[name]; ok { 535 m[name] = c 536 } else { 537 panic(fmt.Sprintf("unknown builder %q", name)) 538 } 539 } 540 541 // Also permit all the reverse buildlets: 542 for name, bc := range dashboard.Builders { 543 if bc.IsReverse() { 544 m[name] = bc 545 } 546 } 547 return m 548 } 549 550 func numCurrentBuilds() int { 551 statusMu.Lock() 552 defer statusMu.Unlock() 553 return len(status) 554 } 555 556 func isBuilding(work buildgo.BuilderRev) bool { 557 statusMu.Lock() 558 defer statusMu.Unlock() 559 _, building := status[work] 560 return building 561 } 562 563 var ( 564 logUnknownBuilder = rate.NewLimiter(rate.Every(5*time.Second), 2) 565 logCantBuildStaging = rate.NewLimiter(rate.Every(1*time.Second), 2) 566 ) 567 568 // mayBuildRev reports whether the build type & revision should be started. 569 // It returns true if it's not already building, and if a reverse buildlet is 570 // required, if an appropriate machine is registered. 571 func mayBuildRev(rev buildgo.BuilderRev) bool { 572 if isBuilding(rev) { 573 return false 574 } 575 if rev.SubName != "" { 576 // Don't build repos we don't know about, 577 // so importPathOfRepo won't panic later. 578 if r, ok := repos.ByGerritProject[rev.SubName]; !ok || r.ImportPath == "" || !r.CoordinatorCanBuild { 579 return false 580 } 581 } 582 buildConf, ok := dashboard.Builders[rev.Name] 583 if !ok { 584 if logUnknownBuilder.Allow() { 585 log.Printf("unknown builder %q", rev.Name) 586 } 587 return false 588 } 589 gceBuildEnv := pool.NewGCEConfiguration().BuildEnv() 590 if gceBuildEnv.MaxBuilds > 0 && numCurrentBuilds() >= gceBuildEnv.MaxBuilds { 591 return false 592 } 593 if buildConf.IsReverse() && !pool.ReversePool().CanBuild(buildConf.HostType) { 594 return false 595 } 596 return true 597 } 598 599 func setStatus(work buildgo.BuilderRev, st *buildStatus) { 600 statusMu.Lock() 601 defer statusMu.Unlock() 602 // TODO: panic if status[work] already exists. audit all callers. 603 // For instance, what if a trybot is running, and then the CL is merged 604 // and the findWork goroutine picks it up and it has the same commit, 605 // because it didn't need to be rebased in Gerrit's cherrypick? 606 // Could we then have two running with the same key? 607 status[work] = st 608 } 609 610 func markDone(work buildgo.BuilderRev) { 611 statusMu.Lock() 612 defer statusMu.Unlock() 613 st, ok := status[work] 614 if !ok { 615 return 616 } 617 delete(status, work) 618 if len(statusDone) == maxStatusDone { 619 copy(statusDone, statusDone[1:]) 620 statusDone = statusDone[:len(statusDone)-1] 621 } 622 statusDone = append(statusDone, st) 623 } 624 625 // statusPtrStr disambiguates which status to return if there are 626 // multiple in the history (e.g. recent failures where the build 627 // didn't finish for reasons outside of all.bash failing) 628 func getStatus(work buildgo.BuilderRev, statusPtrStr string) *buildStatus { 629 statusMu.Lock() 630 defer statusMu.Unlock() 631 match := func(st *buildStatus) bool { 632 return statusPtrStr == "" || fmt.Sprintf("%p", st) == statusPtrStr 633 } 634 if st, ok := status[work]; ok && match(st) { 635 return st 636 } 637 for _, st := range statusDone { 638 if st.BuilderRev == work && match(st) { 639 return st 640 } 641 } 642 for k, ts := range tries { 643 if k.Commit == work.Rev { 644 ts.mu.Lock() 645 for _, st := range ts.builds { 646 if st.BuilderRev == work && match(st) { 647 ts.mu.Unlock() 648 return st 649 } 650 } 651 ts.mu.Unlock() 652 } 653 } 654 return nil 655 } 656 657 // cancelOnePostSubmitBuildWithHostType tries to cancel one 658 // post-submit (non trybot) build with the provided host type and 659 // reports whether it did so. 660 // 661 // It currently selects the one that's been running the least amount 662 // of time, but that's not guaranteed. 663 func cancelOnePostSubmitBuildWithHostType(hostType string) bool { 664 statusMu.Lock() 665 defer statusMu.Unlock() 666 var best *buildStatus 667 for _, st := range status { 668 if st.isTry() || st.conf.HostType != hostType { 669 continue 670 } 671 if best == nil || st.startTime.After(best.startTime) { 672 best = st 673 } 674 } 675 if best != nil { 676 go best.cancelBuild() 677 } 678 return best != nil 679 } 680 681 type byAge []*buildStatus 682 683 func (s byAge) Len() int { return len(s) } 684 func (s byAge) Less(i, j int) bool { return s[i].startTime.Before(s[j].startTime) } 685 func (s byAge) Swap(i, j int) { s[i], s[j] = s[j], s[i] } 686 687 func serveTryStatus(json bool) http.HandlerFunc { 688 return func(w http.ResponseWriter, r *http.Request) { 689 ts := trySetOfCommitPrefix(r.FormValue("commit")) 690 var tss trySetState 691 if ts != nil { 692 ts.mu.Lock() 693 tss = ts.trySetState.clone() 694 ts.mu.Unlock() 695 } 696 if json { 697 serveTryStatusJSON(w, r, ts, tss) 698 return 699 } 700 serveTryStatusHTML(w, ts, tss) 701 } 702 } 703 704 // tss is a clone that does not require ts' lock. 705 func serveTryStatusJSON(w http.ResponseWriter, r *http.Request, ts *trySet, tss trySetState) { 706 w.Header().Set("Access-Control-Allow-Origin", "*") 707 if r.Method == "OPTIONS" { 708 // This is likely a pre-flight CORS request. 709 return 710 } 711 var resp struct { 712 Success bool `json:"success"` 713 Error string `json:"error,omitempty"` 714 Payload interface{} `json:"payload,omitempty"` 715 } 716 if ts == nil { 717 var buf bytes.Buffer 718 resp.Error = "TryBot result not found (already done, invalid, or not yet discovered from Gerrit). Check Gerrit for results." 719 if err := json.NewEncoder(&buf).Encode(resp); err != nil { 720 http.Error(w, err.Error(), http.StatusInternalServerError) 721 return 722 } 723 w.Header().Set("Content-Type", "application/json") 724 w.WriteHeader(http.StatusNotFound) 725 w.Write(buf.Bytes()) 726 return 727 } 728 type litebuild struct { 729 Name string `json:"name"` 730 StartTime time.Time `json:"startTime"` 731 Done bool `json:"done"` 732 Succeeded bool `json:"succeeded"` 733 } 734 var result struct { 735 ChangeID string `json:"changeId"` 736 Commit string `json:"commit"` 737 Builds []litebuild `json:"builds"` 738 } 739 result.Commit = ts.Commit 740 result.ChangeID = ts.ChangeID 741 742 for _, bs := range tss.builds { 743 var lb litebuild 744 bs.mu.Lock() 745 lb.Name = bs.Name 746 lb.StartTime = bs.startTime 747 if !bs.done.IsZero() { 748 lb.Done = true 749 lb.Succeeded = bs.succeeded 750 } 751 bs.mu.Unlock() 752 result.Builds = append(result.Builds, lb) 753 } 754 resp.Success = true 755 resp.Payload = result 756 var buf bytes.Buffer 757 if err := json.NewEncoder(&buf).Encode(resp); err != nil { 758 log.Printf("Could not encode JSON response: %v", err) 759 http.Error(w, "error encoding JSON", http.StatusInternalServerError) 760 return 761 } 762 w.Header().Set("Content-Type", "application/json") 763 w.Write(buf.Bytes()) 764 } 765 766 // Styles unique to the trybot status page. 767 const tryStatusCSS = ` 768 <style> 769 p { 770 line-height: 1.15em; 771 } 772 773 table { 774 font-size: 11pt; 775 } 776 777 .nobr { 778 white-space: nowrap; 779 } 780 781 </style> 782 ` 783 784 // tss is a clone that does not require ts' lock. 785 func serveTryStatusHTML(w http.ResponseWriter, ts *trySet, tss trySetState) { 786 if ts == nil { 787 http.Error(w, "TryBot result not found (already done, invalid, or not yet discovered from Gerrit). Check Gerrit for results.", http.StatusNotFound) 788 return 789 } 790 buf := new(bytes.Buffer) 791 w.Header().Set("Content-Type", "text/html; charset=utf-8") 792 buf.WriteString("<!DOCTYPE html><head><title>trybot status</title>") 793 buf.WriteString(`<link rel="stylesheet" href="/style.css"/>`) 794 buf.WriteString(tryStatusCSS) 795 buf.WriteString("</head><body>") 796 fmt.Fprintf(buf, "[<a href='/'>homepage</a>] > %s\n", ts.ChangeID) 797 fmt.Fprintf(buf, "<h1>Trybot Status</h1>") 798 fmt.Fprintf(buf, "<p>Change-ID: <a href='https://go-review.googlesource.com/#/q/%s'>%s</a><br />\n", ts.ChangeID, ts.ChangeID) 799 fmt.Fprintf(buf, "Commit: <a href='https://go-review.googlesource.com/#/q/%s'>%s</a></p>\n", ts.Commit, ts.Commit) 800 fmt.Fprintf(buf, "<p>Builds remaining: %d</p>\n", tss.remain) 801 fmt.Fprintf(buf, "<h4>Builds</h4>\n") 802 fmt.Fprintf(buf, "<table cellpadding=5 border=0>\n") 803 for _, bs := range tss.builds { 804 var status string 805 bs.mu.Lock() 806 if !bs.done.IsZero() { 807 if bs.succeeded { 808 status = "pass" 809 } else { 810 status = "<b>FAIL</b>" 811 } 812 } else { 813 status = fmt.Sprintf("<i>running</i> %s", time.Since(bs.startTime).Round(time.Second)) 814 } 815 if u := bs.logURL; u != "" { 816 status = fmt.Sprintf(`<a href="%s">%s</a>`, html.EscapeString(u), status) 817 } 818 bs.mu.Unlock() 819 fmt.Fprintf(buf, "<tr><td class=\"nobr\">• %s</td><td>%s</td></tr>\n", 820 html.EscapeString(bs.NameAndBranch()), status) 821 } 822 fmt.Fprintf(buf, "</table>\n") 823 fmt.Fprintf(buf, "<h4>Full Detail</h4><table cellpadding=5 border=1>\n") 824 for _, bs := range tss.builds { 825 status := "<i>(running)</i>" 826 bs.mu.Lock() 827 if !bs.done.IsZero() { 828 if bs.succeeded { 829 status = "pass" 830 } else { 831 status = "<b>FAIL</b>" 832 } 833 } 834 bs.mu.Unlock() 835 fmt.Fprintf(buf, "<tr valign=top><td align=left>%s</td><td align=center>%s</td><td><pre>%s</pre></td></tr>\n", 836 html.EscapeString(bs.NameAndBranch()), 837 status, 838 bs.HTMLStatusTruncated()) 839 } 840 fmt.Fprintf(buf, "</table>") 841 w.Write(buf.Bytes()) 842 } 843 844 func trySetOfCommitPrefix(commitPrefix string) *trySet { 845 if commitPrefix == "" { 846 return nil 847 } 848 statusMu.Lock() 849 defer statusMu.Unlock() 850 for k, ts := range tries { 851 if strings.HasPrefix(k.Commit, commitPrefix) { 852 return ts 853 } 854 } 855 return nil 856 } 857 858 func handleLogs(w http.ResponseWriter, r *http.Request) { 859 br := buildgo.BuilderRev{ 860 Name: r.FormValue("name"), 861 Rev: r.FormValue("rev"), 862 SubName: r.FormValue("subName"), // may be empty 863 SubRev: r.FormValue("subRev"), // may be empty 864 } 865 st := getStatus(br, r.FormValue("st")) 866 if st == nil { 867 http.NotFound(w, r) 868 return 869 } 870 w.Header().Set("Content-Type", "text/plain; charset=utf-8") 871 w.Header().Set("X-Content-Type-Options", "nosniff") 872 writeStatusHeader(w, st) 873 874 nostream := r.FormValue("nostream") != "" 875 if nostream || !st.isRunning() { 876 if nostream { 877 fmt.Fprintf(w, "\n\n(live streaming disabled; reload manually to see status)\n") 878 } 879 w.Write(st.output.Bytes()) 880 return 881 } 882 883 if !st.hasEvent("make_and_test") && !st.hasEvent("make_cross_compile_kube") { 884 fmt.Fprintf(w, "\n\n(buildlet still starting; no live streaming. reload manually to see status)\n") 885 return 886 } 887 888 w.(http.Flusher).Flush() 889 890 output := st.output.Reader() 891 go func() { 892 <-r.Context().Done() 893 output.Close() 894 }() 895 buf := make([]byte, 65536) 896 for { 897 n, err := output.Read(buf) 898 if _, err2 := w.Write(buf[:n]); err2 != nil { 899 return 900 } 901 w.(http.Flusher).Flush() 902 if err != nil { 903 break 904 } 905 } 906 } 907 908 func writeStatusHeader(w http.ResponseWriter, st *buildStatus) { 909 st.mu.Lock() 910 defer st.mu.Unlock() 911 fmt.Fprintf(w, " builder: %s\n", st.Name) 912 fmt.Fprintf(w, " rev: %s\n", st.Rev) 913 workaroundFlush(w) 914 fmt.Fprintf(w, " buildlet: %s\n", st.bc) 915 fmt.Fprintf(w, " started: %v\n", st.startTime) 916 done := !st.done.IsZero() 917 if done { 918 fmt.Fprintf(w, " ended: %v\n", st.done) 919 fmt.Fprintf(w, " success: %v\n", st.succeeded) 920 } else { 921 fmt.Fprintf(w, " status: still running\n") 922 } 923 if len(st.events) > 0 { 924 io.WriteString(w, "\nEvents:\n") 925 st.writeEventsLocked(w, false, 0) 926 } 927 io.WriteString(w, "\nBuild log:\n") 928 workaroundFlush(w) 929 } 930 931 // workaroundFlush is an unnecessary flush to work around a bug in Chrome. 932 // See https://code.google.com/p/chromium/issues/detail?id=2016 for the details. 933 // In summary: a couple unnecessary chunk flushes bypass the content type 934 // sniffing which happen (even if unused?), even if you set nosniff as we do 935 // in func handleLogs. 936 func workaroundFlush(w http.ResponseWriter) { 937 w.(http.Flusher).Flush() 938 } 939 940 // findWorkLoop polls https://build.golang.org/?mode=json looking for 941 // new post-submit work for the main dashboard. It does not support 942 // gccgo. This is separate from trybots, which populates its work from 943 // findTryWorkLoop. 944 func findWorkLoop() { 945 // TODO: remove this hard-coded 15 second ticker and instead 946 // do some new streaming gRPC call to maintnerd to subscribe 947 // to new commits. 948 ticker := time.NewTicker(15 * time.Second) 949 // We wait for the ticker first, before looking for work, to 950 // give findTryWork a head start. Because try work is more 951 // important and the scheduler can't (yet?) preempt an 952 // existing post-submit build to take it over for a trybot, we 953 // want to make sure that reverse buildlets get assigned to 954 // trybots/slowbots first on start-up. 955 for range ticker.C { 956 if err := findWork(); err != nil { 957 log.Printf("failed to find new work: %v", err) 958 } 959 } 960 } 961 962 // findWork polls the https://build.golang.org/ dashboard once to find 963 // post-submit work to do. It's called in a loop by findWorkLoop. 964 func findWork() error { 965 var bs types.BuildStatus 966 if err := dash("GET", "", url.Values{ 967 "mode": {"json"}, 968 "branch": {"mixed"}, 969 }, nil, &bs); err != nil { 970 return err 971 } 972 knownToDashboard := map[string]bool{} // keys are builder 973 for _, b := range bs.Builders { 974 knownToDashboard[b] = true 975 } 976 977 var goRevisions []string // revisions of repo "go", branch "master" 978 var goRevisionsTypeParams []string // revisions of repo "go", branch "dev.typeparams" golang.org/issue/46786 and golang.org/issue/46864 979 seenSubrepo := make(map[string]bool) 980 commitTime := make(map[string]string) // git rev => "2019-11-20T22:54:54Z" (time.RFC3339 from build.golang.org's JSON) 981 commitBranch := make(map[string]string) // git rev => "master" 982 983 add := func(br buildgo.BuilderRev) { 984 var d commitDetail 985 var err error 986 if revCommitTime := commitTime[br.Rev]; revCommitTime != "" { 987 d.RevCommitTime, err = time.Parse(time.RFC3339, revCommitTime) 988 if err != nil { 989 // Log the error, but ignore it. We can tolerate the lack of a commit time. 990 log.Printf("failure parsing commit time %q for %q: %v", revCommitTime, br.Rev, err) 991 } 992 } 993 d.RevBranch = commitBranch[br.Rev] 994 if br.SubRev != "" { 995 if subRevCommitTime := commitTime[br.SubRev]; subRevCommitTime != "" { 996 d.SubRevCommitTime, err = time.Parse(time.RFC3339, subRevCommitTime) 997 if err != nil { 998 // Log the error, but ignore it. We can tolerate the lack of a commit time. 999 log.Printf("failure parsing commit time %q for %q: %v", subRevCommitTime, br.SubRev, err) 1000 } 1001 } 1002 d.SubRevBranch = commitBranch[br.SubRev] 1003 } 1004 addWorkDetail(br, d) 1005 } 1006 1007 for _, br := range bs.Revisions { 1008 if r, ok := repos.ByGerritProject[br.Repo]; !ok || !r.CoordinatorCanBuild { 1009 continue 1010 } 1011 if br.Repo == "grpc-review" { 1012 // Skip the grpc repo. It's only for reviews 1013 // for now (using LetsUseGerrit). 1014 continue 1015 } 1016 commitTime[br.Revision] = br.Date 1017 commitBranch[br.Revision] = br.Branch 1018 awaitSnapshot := false 1019 if br.Repo == "go" { 1020 if br.Branch == "master" { 1021 goRevisions = append(goRevisions, br.Revision) 1022 } else if br.Branch == "dev.typeparams" { 1023 goRevisionsTypeParams = append(goRevisionsTypeParams, br.Revision) 1024 } 1025 } else { 1026 // If this is the first time we've seen this sub-repo 1027 // in this loop, then br.GoRevision is the go repo 1028 // HEAD. To save resources, we only build subrepos 1029 // against HEAD once we have a snapshot. 1030 // The next time we see this sub-repo in this loop, the 1031 // GoRevision is one of the release branches, for which 1032 // we may not have a snapshot (if the release was made 1033 // a long time before this builder came up), so skip 1034 // the snapshot check. 1035 awaitSnapshot = !seenSubrepo[br.Repo] 1036 seenSubrepo[br.Repo] = true 1037 } 1038 1039 if len(br.Results) != len(bs.Builders) { 1040 return errors.New("bogus JSON response from dashboard: results is too long.") 1041 } 1042 for i, res := range br.Results { 1043 if res != "" { 1044 // It's either "ok" or a failure URL. 1045 continue 1046 } 1047 builder := bs.Builders[i] 1048 builderInfo, ok := dashboard.Builders[builder] 1049 if !ok { 1050 // Not managed by the coordinator. 1051 continue 1052 } 1053 if !builderInfo.BuildsRepoPostSubmit(br.Repo, br.Branch, br.GoBranch) { 1054 continue 1055 } 1056 var rev buildgo.BuilderRev 1057 if br.Repo == "go" { 1058 rev = buildgo.BuilderRev{ 1059 Name: builder, 1060 Rev: br.Revision, 1061 } 1062 } else { 1063 rev = buildgo.BuilderRev{ 1064 Name: builder, 1065 Rev: br.GoRevision, 1066 SubName: br.Repo, 1067 SubRev: br.Revision, 1068 } 1069 if awaitSnapshot && 1070 // If this is a builder that snapshots after 1071 // make.bash but the snapshot doesn't yet exist, 1072 // then skip. But some builders on slow networks 1073 // don't snapshot, so don't wait for them. They'll 1074 // need to run make.bash first for x/ repos tests. 1075 !builderInfo.SkipSnapshot && !rev.SnapshotExists(context.TODO(), pool.NewGCEConfiguration().BuildEnv()) { 1076 continue 1077 } 1078 } 1079 add(rev) 1080 } 1081 } 1082 1083 // And to bootstrap new builders, see if we have any builders 1084 // that the dashboard doesn't know about. 1085 for b, builderInfo := range dashboard.Builders { 1086 if knownToDashboard[b] { 1087 // no need to bootstrap. 1088 continue 1089 } 1090 if builderInfo.BuildsRepoPostSubmit("go", "master", "master") { 1091 for _, rev := range goRevisions { 1092 add(buildgo.BuilderRev{Name: b, Rev: rev}) 1093 } 1094 } else if builderInfo.BuildsRepoPostSubmit("go", "dev.typeparams", "dev.typeparams") { 1095 // schedule builds on dev.typeparams branch 1096 // golang.org/issue/46786 and golang.org/issue/46864 1097 for _, rev := range goRevisionsTypeParams { 1098 add(buildgo.BuilderRev{Name: b, Rev: rev}) 1099 } 1100 } 1101 } 1102 return nil 1103 } 1104 1105 // findTryWorkLoop is a goroutine which loops periodically and queries 1106 // Gerrit for TryBot work. 1107 func findTryWorkLoop() { 1108 if pool.NewGCEConfiguration().TryDepsErr() != nil { 1109 return 1110 } 1111 ticker := time.NewTicker(1 * time.Second) 1112 for { 1113 if err := findTryWork(); err != nil { 1114 log.Printf("failed to find trybot work: %v", err) 1115 } 1116 <-ticker.C 1117 } 1118 } 1119 1120 func findTryWork() error { 1121 isStaging := pool.NewGCEConfiguration().InStaging() 1122 if isStaging && !stagingTryWork { 1123 return nil 1124 } 1125 ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) // should be milliseconds 1126 defer cancel() 1127 tryRes, err := maintnerClient.GoFindTryWork(ctx, &apipb.GoFindTryWorkRequest{ForStaging: isStaging}) 1128 if err != nil { 1129 return err 1130 } 1131 1132 now := time.Now() 1133 1134 statusMu.Lock() 1135 defer statusMu.Unlock() 1136 1137 tryList = tryList[:0] 1138 for _, work := range tryRes.Waiting { 1139 if work.ChangeId == "" || work.Commit == "" { 1140 log.Printf("Warning: skipping incomplete %#v", work) 1141 continue 1142 } 1143 if r, ok := repos.ByGerritProject[work.Project]; !ok || !r.CoordinatorCanBuild { 1144 continue 1145 } 1146 key := tryWorkItemKey(work) 1147 tryList = append(tryList, key) 1148 if ts, ok := tries[key]; ok { 1149 // already in progress 1150 ts.wantedAsOf = now 1151 continue 1152 } else { 1153 ts := newTrySet(work) 1154 ts.wantedAsOf = now 1155 tries[key] = ts 1156 } 1157 } 1158 for k, ts := range tries { 1159 if ts.wantedAsOf != now { 1160 delete(tries, k) 1161 go ts.cancelBuilds() 1162 } 1163 } 1164 return nil 1165 } 1166 1167 type tryKey struct { 1168 Project string // "go", "net", etc 1169 Branch string // master 1170 ChangeID string // I1a27695838409259d1586a0adfa9f92bccf7ceba 1171 Commit string // ecf3dffc81dc21408fb02159af352651882a8383 1172 } 1173 1174 // ChangeTriple returns the Gerrit (project, branch, change-ID) triple 1175 // uniquely identifying this change. Several Gerrit APIs require this 1176 // form of if there are multiple changes with the same Change-ID. 1177 func (k *tryKey) ChangeTriple() string { 1178 return fmt.Sprintf("%s~%s~%s", k.Project, k.Branch, k.ChangeID) 1179 } 1180 1181 // trySet is a the state of a set of builds of different 1182 // configurations, all for the same (Change-ID, Commit) pair. The 1183 // sets which are still wanted (not already submitted or canceled) are 1184 // stored in the global 'tries' map. 1185 type trySet struct { 1186 // immutable 1187 tryKey 1188 tryID string // "T" + 9 random hex 1189 slowBots []*dashboard.BuildConfig // any opt-in slower builders to run in a trybot run 1190 xrepos []*buildStatus // any opt-in x/ repo builds to run in a trybot run 1191 1192 // wantedAsOf is guarded by statusMu and is used by 1193 // findTryWork. It records the last time this tryKey was still 1194 // wanted. 1195 wantedAsOf time.Time 1196 1197 // mu guards the following fields. 1198 // See LOCK ORDER comment above. 1199 mu sync.Mutex 1200 canceled bool // try run is no longer wanted and its builds were canceled 1201 trySetState 1202 errMsg bytes.Buffer 1203 } 1204 1205 type trySetState struct { 1206 remain int 1207 failed []string // builder names, with optional " ($branch)" suffix 1208 builds []*buildStatus 1209 } 1210 1211 func (ts trySetState) clone() trySetState { 1212 return trySetState{ 1213 remain: ts.remain, 1214 failed: append([]string(nil), ts.failed...), 1215 builds: append([]*buildStatus(nil), ts.builds...), 1216 } 1217 } 1218 1219 func tryWorkItemKey(work *apipb.GerritTryWorkItem) tryKey { 1220 return tryKey{ 1221 Project: work.Project, 1222 Branch: work.Branch, 1223 ChangeID: work.ChangeId, 1224 Commit: work.Commit, 1225 } 1226 } 1227 1228 var testingKnobSkipBuilds bool 1229 1230 // newTrySet creates a new trySet group of builders for a given 1231 // work item, the (Project, Branch, Change-ID, Commit) tuple. 1232 // It also starts goroutines for each build. 1233 // 1234 // Must hold statusMu. 1235 func newTrySet(work *apipb.GerritTryWorkItem) *trySet { 1236 goBranch := work.Branch 1237 var subBranch string // branch of subrepository, empty for main Go repo. 1238 if work.Project != "go" && len(work.GoBranch) > 0 { 1239 // work.GoBranch is non-empty when work.Project != "go", 1240 // so prefer work.GoBranch[0] over work.Branch for goBranch. 1241 goBranch = work.GoBranch[0] 1242 subBranch = work.Branch 1243 } 1244 tryBots := dashboard.TryBuildersForProject(work.Project, work.Branch, goBranch) 1245 slowBots, invalidSlowBots := slowBotsFromComments(work) 1246 builders := joinBuilders(tryBots, slowBots) 1247 1248 key := tryWorkItemKey(work) 1249 log.Printf("Starting new trybot set for %v (ignored invalid terms = %q)", key, invalidSlowBots) 1250 ts := &trySet{ 1251 tryKey: key, 1252 tryID: "T" + randHex(9), 1253 trySetState: trySetState{ 1254 builds: make([]*buildStatus, 0, len(builders)), 1255 }, 1256 slowBots: slowBots, 1257 } 1258 1259 // Defensive check that the input is well-formed. 1260 // Each GoCommit should have a GoBranch and a GoVersion. 1261 // There should always be at least one GoVersion. 1262 if len(work.GoBranch) < len(work.GoCommit) { 1263 log.Printf("WARNING: len(GoBranch) of %d != len(GoCommit) of %d", len(work.GoBranch), len(work.GoCommit)) 1264 work.GoCommit = work.GoCommit[:len(work.GoBranch)] 1265 } 1266 if len(work.GoVersion) < len(work.GoCommit) { 1267 log.Printf("WARNING: len(GoVersion) of %d != len(GoCommit) of %d", len(work.GoVersion), len(work.GoCommit)) 1268 work.GoCommit = work.GoCommit[:len(work.GoVersion)] 1269 } 1270 if len(work.GoVersion) == 0 { 1271 log.Print("WARNING: len(GoVersion) is zero, want at least one") 1272 work.GoVersion = []*apipb.MajorMinor{{}} 1273 } 1274 1275 addBuilderToSet := func(bs *buildStatus, brev buildgo.BuilderRev) { 1276 bs.trySet = ts 1277 status[brev] = bs 1278 1279 idx := len(ts.builds) 1280 ts.builds = append(ts.builds, bs) 1281 ts.remain++ 1282 if testingKnobSkipBuilds { 1283 return 1284 } 1285 go bs.start() // acquires statusMu itself, so in a goroutine 1286 go ts.awaitTryBuild(idx, bs, brev) 1287 } 1288 1289 var mainBuildGoCommit string 1290 if key.Project != "go" && len(work.GoCommit) > 0 { 1291 // work.GoCommit is non-empty when work.Project != "go". 1292 // For the main build, use the first GoCommit, which represents Go tip (master branch). 1293 mainBuildGoCommit = work.GoCommit[0] 1294 } 1295 1296 // Start the main TryBot build using the selected builders. 1297 // There may be additional builds, those are handled below. 1298 if !testingKnobSkipBuilds { 1299 go ts.notifyStarting(invalidSlowBots) 1300 } 1301 for _, bconf := range builders { 1302 goVersion := types.MajorMinor{Major: int(work.GoVersion[0].Major), Minor: int(work.GoVersion[0].Minor)} 1303 if goVersion.Less(bconf.MinimumGoVersion) { 1304 continue 1305 } 1306 brev := tryKeyToBuilderRev(bconf.Name, key, mainBuildGoCommit) 1307 bs, err := newBuild(brev, commitDetail{RevBranch: goBranch, SubRevBranch: subBranch, AuthorEmail: work.AuthorEmail}) 1308 if err != nil { 1309 log.Printf("can't create build for %q: %v", brev, err) 1310 continue 1311 } 1312 addBuilderToSet(bs, brev) 1313 } 1314 1315 // If this is a golang.org/x repo and there's more than one GoCommit, 1316 // that means we're testing against prior releases of Go too. 1317 // The version selection logic is currently in maintapi's GoFindTryWork implementation. 1318 if key.Project != "go" && len(work.GoCommit) >= 2 { 1319 // linuxBuilder is the standard builder for this purpose. 1320 linuxBuilder := dashboard.Builders["linux-amd64"] 1321 1322 for i, goRev := range work.GoCommit { 1323 if i == 0 { 1324 // Skip the i==0 element, which was already handled above. 1325 continue 1326 } 1327 branch := work.GoBranch[i] 1328 if !linuxBuilder.BuildsRepoTryBot(key.Project, "master", branch) { 1329 continue 1330 } 1331 goVersion := types.MajorMinor{Major: int(work.GoVersion[i].Major), Minor: int(work.GoVersion[i].Minor)} 1332 if goVersion.Less(linuxBuilder.MinimumGoVersion) { 1333 continue 1334 } 1335 brev := tryKeyToBuilderRev(linuxBuilder.Name, key, goRev) 1336 bs, err := newBuild(brev, commitDetail{RevBranch: branch, SubRevBranch: subBranch, AuthorEmail: work.AuthorEmail}) 1337 if err != nil { 1338 log.Printf("can't create build for %q: %v", brev, err) 1339 continue 1340 } 1341 addBuilderToSet(bs, brev) 1342 } 1343 } 1344 1345 // For the Go project on the "master" branch, 1346 // use the TRY= syntax to test against x repos. 1347 if branch := key.Branch; key.Project == "go" && branch == "master" { 1348 // customBuilder optionally specifies the builder to use for the build 1349 // (empty string means to use the default builder). 1350 addXrepo := func(project, customBuilder string) *buildStatus { 1351 // linux-amd64 is the default builder as it is the fastest and least 1352 // expensive. 1353 builder := dashboard.Builders["linux-amd64"] 1354 if customBuilder != "" { 1355 b, ok := dashboard.Builders[customBuilder] 1356 if !ok { 1357 log.Printf("can't resolve requested builder %q", customBuilder) 1358 return nil 1359 } 1360 builder = b 1361 } 1362 1363 if testingKnobSkipBuilds { 1364 return nil 1365 } 1366 if !builder.BuildsRepoPostSubmit(project, branch, branch) { 1367 log.Printf("builder %q isn't configured to build %q@%q", builder.Name, project, branch) 1368 return nil 1369 } 1370 rev, err := getRepoHead(project) 1371 if err != nil { 1372 log.Printf("can't determine repo head for %q: %v", project, err) 1373 return nil 1374 } 1375 brev := buildgo.BuilderRev{ 1376 Name: builder.Name, 1377 Rev: work.Commit, 1378 SubName: project, 1379 SubRev: rev, 1380 } 1381 // getRepoHead always fetches master, so use that as the SubRevBranch. 1382 bs, err := newBuild(brev, commitDetail{RevBranch: branch, SubRevBranch: "master", AuthorEmail: work.AuthorEmail}) 1383 if err != nil { 1384 log.Printf("can't create x/%s trybot build for go/master commit %s: %v", project, rev, err) 1385 return nil 1386 } 1387 addBuilderToSet(bs, brev) 1388 return bs 1389 } 1390 1391 // First, add the opt-in x repos. 1392 repoBuilders := xReposFromComments(work) 1393 for rb := range repoBuilders { 1394 if bs := addXrepo(rb.Project, rb.Builder); bs != nil { 1395 ts.xrepos = append(ts.xrepos, bs) 1396 } 1397 } 1398 1399 // Always include the default x/tools builder. See golang.org/issue/34348. 1400 // Do not add it to the trySet's list of opt-in x repos, however. 1401 if haveDefaultToolsBuild := repoBuilders[xRepoAndBuilder{Project: "tools"}]; !haveDefaultToolsBuild { 1402 addXrepo("tools", "") 1403 } 1404 } 1405 1406 return ts 1407 } 1408 1409 // Note: called in some paths where statusMu is held; do not make RPCs. 1410 func tryKeyToBuilderRev(builder string, key tryKey, goRev string) buildgo.BuilderRev { 1411 // This function is called from within newTrySet, holding statusMu, s 1412 if key.Project == "go" { 1413 return buildgo.BuilderRev{ 1414 Name: builder, 1415 Rev: key.Commit, 1416 } 1417 } 1418 return buildgo.BuilderRev{ 1419 Name: builder, 1420 Rev: goRev, 1421 SubName: key.Project, 1422 SubRev: key.Commit, 1423 } 1424 } 1425 1426 // joinBuilders joins sets of builders into one set. 1427 // The resulting set contains unique builders sorted by name. 1428 func joinBuilders(sets ...[]*dashboard.BuildConfig) []*dashboard.BuildConfig { 1429 byName := make(map[string]*dashboard.BuildConfig) 1430 for _, set := range sets { 1431 for _, bc := range set { 1432 byName[bc.Name] = bc 1433 } 1434 } 1435 var all []*dashboard.BuildConfig 1436 for _, bc := range byName { 1437 all = append(all, bc) 1438 } 1439 sort.Slice(all, func(i, j int) bool { return all[i].Name < all[j].Name }) 1440 return all 1441 } 1442 1443 // state returns a copy of the trySet's state. 1444 func (ts *trySet) state() trySetState { 1445 ts.mu.Lock() 1446 defer ts.mu.Unlock() 1447 return ts.trySetState.clone() 1448 } 1449 1450 // tryBotsTag returns a Gerrit tag for the TryBots state s. See Issue 39828 and 1451 // https://gerrit-review.googlesource.com/Documentation/rest-api-changes.html#review-input. 1452 func tryBotsTag(s string) string { 1453 return "autogenerated:trybots~" + s 1454 } 1455 1456 func isTryBotsTag(s string) bool { 1457 return strings.HasPrefix(s, "autogenerated:trybots~") 1458 } 1459 1460 // A commentThread is a thread of Gerrit comments. 1461 type commentThread struct { 1462 // root is the first comment in the thread. 1463 root gerrit.CommentInfo 1464 // thread is a list of all the comments in the thread, including the root, 1465 // sorted chronologically. 1466 thread []gerrit.CommentInfo 1467 // unresolved is the thread unresolved state, based on the last comment. 1468 unresolved bool 1469 } 1470 1471 // listPatchSetThreads returns a list of PATCHSET_LEVEL comment threads, sorted 1472 // by the time at which they were started. 1473 func listPatchSetThreads(gerritClient *gerrit.Client, changeID string) ([]*commentThread, error) { 1474 comments, err := gerritClient.ListChangeComments(context.Background(), changeID) 1475 if err != nil { 1476 return nil, err 1477 } 1478 patchSetComments := comments["/PATCHSET_LEVEL"] 1479 if len(patchSetComments) == 0 { 1480 return nil, nil 1481 } 1482 1483 // The API doesn't sort comments chronologically, but "the state of 1484 // resolution of a comment thread is stored in the last comment in that 1485 // thread chronologically", so first of all sort them by time. 1486 sort.Slice(patchSetComments, func(i, j int) bool { 1487 return patchSetComments[i].Updated.Time().Before(patchSetComments[j].Updated.Time()) 1488 }) 1489 1490 // roots is a map of message IDs to their thread root. 1491 roots := make(map[string]string) 1492 threads := make(map[string]*commentThread) 1493 var result []*commentThread 1494 for _, c := range patchSetComments { 1495 if c.InReplyTo == "" { 1496 roots[c.ID] = c.ID 1497 threads[c.ID] = &commentThread{ 1498 root: c, 1499 thread: []gerrit.CommentInfo{c}, 1500 unresolved: *c.Unresolved, 1501 } 1502 if c.Unresolved != nil { 1503 threads[c.ID].unresolved = *c.Unresolved 1504 } 1505 result = append(result, threads[c.ID]) 1506 continue 1507 } 1508 1509 root, ok := roots[c.InReplyTo] 1510 if !ok { 1511 return nil, fmt.Errorf("%s has no parent", c.ID) 1512 } 1513 roots[c.ID] = root 1514 threads[root].thread = append(threads[root].thread, c) 1515 if c.Unresolved != nil { 1516 threads[root].unresolved = *c.Unresolved 1517 } 1518 } 1519 1520 return result, nil 1521 } 1522 1523 func (ts *trySet) statusPage() string { 1524 return "https://farmer.golang.org/try?commit=" + ts.Commit[:8] 1525 } 1526 1527 // notifyStarting runs in its own goroutine and posts to Gerrit that 1528 // the trybots have started on the user's CL with a link of where to watch. 1529 func (ts *trySet) notifyStarting(invalidSlowBots []string) { 1530 name := "TryBots" 1531 if len(ts.slowBots) > 0 { 1532 name = "SlowBots" 1533 } 1534 msg := name + " beginning. Status page: " + ts.statusPage() + "\n" 1535 1536 if len(invalidSlowBots) > 0 { 1537 msg += fmt.Sprintf("Note that the following SlowBot terms didn't match any existing builder name or slowbot alias: %s.\n", strings.Join(invalidSlowBots, ", ")) 1538 } 1539 1540 // If any of the requested SlowBot builders 1541 // have a known issue, give users a warning. 1542 for _, b := range ts.slowBots { 1543 if len(b.KnownIssues) > 0 { 1544 issueBlock := new(strings.Builder) 1545 fmt.Fprintf(issueBlock, "Note that builder %s has known issues:\n", b.Name) 1546 for _, i := range b.KnownIssues { 1547 fmt.Fprintf(issueBlock, "\thttps://go.dev/issue/%d\n", i) 1548 } 1549 msg += issueBlock.String() 1550 } 1551 } 1552 1553 unresolved := true 1554 ri := gerrit.ReviewInput{ 1555 Tag: tryBotsTag("beginning"), 1556 Comments: map[string][]gerrit.CommentInput{ 1557 "/PATCHSET_LEVEL": {{Message: msg, Unresolved: &unresolved}}, 1558 }, 1559 } 1560 1561 // Mark as resolved old TryBot threads that don't have human comments on them. 1562 gerritClient := pool.NewGCEConfiguration().GerritClient() 1563 if patchSetThreads, err := listPatchSetThreads(gerritClient, ts.ChangeTriple()); err == nil { 1564 for _, t := range patchSetThreads { 1565 if !t.unresolved { 1566 continue 1567 } 1568 hasHumanComments := false 1569 for _, c := range t.thread { 1570 if !isTryBotsTag(c.Tag) { 1571 hasHumanComments = true 1572 break 1573 } 1574 } 1575 if hasHumanComments { 1576 continue 1577 } 1578 unresolved := false 1579 ri.Comments["/PATCHSET_LEVEL"] = append(ri.Comments["/PATCHSET_LEVEL"], gerrit.CommentInput{ 1580 InReplyTo: t.root.ID, 1581 Message: "Superseded.", 1582 Unresolved: &unresolved, 1583 }) 1584 } 1585 } else { 1586 log.Printf("Error getting Gerrit threads on %s: %v", ts.ChangeTriple(), err) 1587 } 1588 1589 if err := gerritClient.SetReview(context.Background(), ts.ChangeTriple(), ts.Commit, ri); err != nil { 1590 log.Printf("Error leaving Gerrit comment on %s: %v", ts.Commit[:8], err) 1591 } 1592 } 1593 1594 // awaitTryBuild runs in its own goroutine and waits for a build in a 1595 // trySet to complete. 1596 // 1597 // If the build fails without getting to the end, it sleeps and 1598 // reschedules it, as long as it's still wanted. 1599 func (ts *trySet) awaitTryBuild(idx int, bs *buildStatus, brev buildgo.BuilderRev) { 1600 for { 1601 WaitCh: 1602 for { 1603 timeout := time.NewTimer(10 * time.Minute) 1604 select { 1605 case <-bs.ctx.Done(): 1606 timeout.Stop() 1607 break WaitCh 1608 case <-timeout.C: 1609 if !ts.wanted() { 1610 // Build was canceled. 1611 return 1612 } 1613 } 1614 } 1615 1616 if bs.hasEvent(eventDone) || bs.hasEvent(eventSkipBuildMissingDep) { 1617 ts.noteBuildComplete(bs) 1618 return 1619 } 1620 1621 // TODO(bradfitz): rethink this logic. we should only 1622 // start a new build if the old one appears dead or 1623 // hung. 1624 1625 // Sleep a bit and retry. 1626 time.Sleep(30 * time.Second) 1627 if !ts.wanted() { 1628 return 1629 } 1630 bs, _ = newBuild(brev, bs.commitDetail) 1631 bs.trySet = ts 1632 go bs.start() 1633 ts.mu.Lock() 1634 ts.builds[idx] = bs 1635 ts.mu.Unlock() 1636 } 1637 } 1638 1639 // wanted reports whether this trySet is still active. 1640 // 1641 // If the commit has been submitted, or change abandoned, or the 1642 // checkbox unchecked, wanted returns false. 1643 func (ts *trySet) wanted() bool { 1644 statusMu.Lock() 1645 defer statusMu.Unlock() 1646 _, ok := tries[ts.tryKey] 1647 return ok 1648 } 1649 1650 // cancelBuilds run in its own goroutine and cancels this trySet's 1651 // currently-active builds because they're no longer wanted. 1652 func (ts *trySet) cancelBuilds() { 1653 ts.mu.Lock() 1654 defer ts.mu.Unlock() 1655 1656 // Only cancel the builds once. And note that they're canceled so we 1657 // can avoid spamming Gerrit later if they come back as failed. 1658 if ts.canceled { 1659 return 1660 } 1661 ts.canceled = true 1662 1663 for _, bs := range ts.builds { 1664 go bs.cancelBuild() 1665 } 1666 } 1667 1668 func (ts *trySet) noteBuildComplete(bs *buildStatus) { 1669 bs.mu.Lock() 1670 var ( 1671 succeeded = bs.succeeded 1672 buildLog = bs.output.String() 1673 ) 1674 bs.mu.Unlock() 1675 1676 ts.mu.Lock() 1677 ts.remain-- 1678 remain := ts.remain 1679 if !succeeded { 1680 ts.failed = append(ts.failed, bs.NameAndBranch()) 1681 } 1682 numFail := len(ts.failed) 1683 canceled := ts.canceled 1684 ts.mu.Unlock() 1685 1686 if canceled { 1687 // Be quiet and don't spam Gerrit. 1688 return 1689 } 1690 1691 const failureFooter = "Consult https://build.golang.org/ to see whether they are new failures. Keep in mind that TryBots currently test *exactly* your git commit, without rebasing. If your commit's git parent is old, the failure might've already been fixed.\n" 1692 1693 s1 := sha1.New() 1694 io.WriteString(s1, buildLog) 1695 objName := fmt.Sprintf("%s/%s_%x.log", bs.Rev[:8], bs.Name, s1.Sum(nil)[:4]) 1696 wr, logURL := newBuildLogBlob(objName) 1697 if _, err := io.WriteString(wr, buildLog); err != nil { 1698 log.Printf("Failed to write to GCS: %v", err) 1699 return 1700 } 1701 if err := wr.Close(); err != nil { 1702 log.Printf("Failed to write to GCS: %v", err) 1703 return 1704 } 1705 1706 bs.mu.Lock() 1707 bs.logURL = logURL 1708 bs.mu.Unlock() 1709 1710 if !succeeded { 1711 ts.mu.Lock() 1712 fmt.Fprintf(&ts.errMsg, "Failed on %s: %s\n", bs.NameAndBranch(), logURL) 1713 ts.mu.Unlock() 1714 } 1715 1716 postInProgressMessage := !succeeded && numFail == 1 && remain > 0 1717 postFinishedMessage := remain == 0 1718 1719 if !postInProgressMessage && !postFinishedMessage { 1720 return 1721 } 1722 1723 var ( 1724 gerritMsg = &strings.Builder{} 1725 gerritTag string 1726 gerritScore int 1727 ) 1728 1729 if postInProgressMessage { 1730 fmt.Fprintf(gerritMsg, "Build is still in progress... "+ 1731 "Status page: https://farmer.golang.org/try?commit=%s\n"+ 1732 "Failed on %s: %s\n"+ 1733 "Other builds still in progress; subsequent failure notices suppressed until final report.\n\n"+ 1734 failureFooter, ts.Commit[:8], bs.NameAndBranch(), logURL) 1735 gerritTag = tryBotsTag("progress") 1736 } 1737 1738 if postFinishedMessage { 1739 name := "TryBots" 1740 if len(ts.slowBots) > 0 { 1741 name = "SlowBots" 1742 } 1743 1744 if numFail == 0 { 1745 gerritScore = 1 1746 fmt.Fprintf(gerritMsg, "%s are happy.\n", name) 1747 gerritTag = tryBotsTag("happy") 1748 } else { 1749 gerritScore = -1 1750 ts.mu.Lock() 1751 errMsg := ts.errMsg.String() 1752 ts.mu.Unlock() 1753 fmt.Fprintf(gerritMsg, "%d of %d %s failed.\n%s\n"+failureFooter, 1754 numFail, len(ts.builds), name, errMsg) 1755 gerritTag = tryBotsTag("failed") 1756 } 1757 fmt.Fprintln(gerritMsg) 1758 if len(ts.slowBots) > 0 { 1759 fmt.Fprintf(gerritMsg, "SlowBot builds that ran:\n") 1760 for _, c := range ts.slowBots { 1761 fmt.Fprintf(gerritMsg, "* %s\n", c.Name) 1762 } 1763 } 1764 if len(ts.xrepos) > 0 { 1765 fmt.Fprintf(gerritMsg, "Also tested the following repos:\n") 1766 for _, st := range ts.xrepos { 1767 fmt.Fprintf(gerritMsg, "* %s\n", st.NameAndBranch()) 1768 } 1769 } 1770 } 1771 1772 var inReplyTo string 1773 gerritClient := pool.NewGCEConfiguration().GerritClient() 1774 if patchSetThreads, err := listPatchSetThreads(gerritClient, ts.ChangeTriple()); err == nil { 1775 for _, t := range patchSetThreads { 1776 if t.root.Tag == tryBotsTag("beginning") && strings.Contains(t.root.Message, ts.statusPage()) { 1777 inReplyTo = t.root.ID 1778 } 1779 } 1780 } else { 1781 log.Printf("Error getting Gerrit threads on %s: %v", ts.ChangeTriple(), err) 1782 } 1783 1784 // Mark resolved if TryBots are happy. 1785 unresolved := gerritScore != 1 1786 1787 ri := gerrit.ReviewInput{ 1788 Tag: gerritTag, 1789 Comments: map[string][]gerrit.CommentInput{ 1790 "/PATCHSET_LEVEL": {{ 1791 InReplyTo: inReplyTo, 1792 Message: gerritMsg.String(), 1793 Unresolved: &unresolved, 1794 }}, 1795 }, 1796 } 1797 if gerritScore != 0 { 1798 ri.Labels = map[string]int{ 1799 "TryBot-Result": gerritScore, 1800 } 1801 } 1802 if err := gerritClient.SetReview(context.Background(), ts.ChangeTriple(), ts.Commit, ri); err != nil { 1803 log.Printf("Error leaving Gerrit comment on %s: %v", ts.Commit[:8], err) 1804 } 1805 } 1806 1807 // getBuildlets creates up to n buildlets and sends them on the returned channel 1808 // before closing the channel. 1809 func getBuildlets(ctx context.Context, n int, schedTmpl *queue.SchedItem, lg pool.Logger) <-chan buildlet.Client { 1810 ch := make(chan buildlet.Client) // NOT buffered 1811 var wg sync.WaitGroup 1812 wg.Add(n) 1813 for i := 0; i < n; i++ { 1814 go func(i int) { 1815 defer wg.Done() 1816 sp := lg.CreateSpan("get_helper", fmt.Sprintf("helper %d/%d", i+1, n)) 1817 schedItem := *schedTmpl // copy; GetBuildlet takes ownership 1818 schedItem.IsHelper = i > 0 1819 bc, err := sched.GetBuildlet(ctx, &schedItem) 1820 sp.Done(err) 1821 if err != nil { 1822 if err != context.Canceled { 1823 log.Printf("failed to get a %s buildlet: %v", schedItem.HostType, err) 1824 } 1825 return 1826 } 1827 lg.LogEventTime("empty_helper_ready", bc.Name()) 1828 select { 1829 case ch <- bc: 1830 case <-ctx.Done(): 1831 lg.LogEventTime("helper_killed_before_use", bc.Name()) 1832 bc.Close() 1833 return 1834 } 1835 }(i) 1836 } 1837 go func() { 1838 wg.Wait() 1839 close(ch) 1840 }() 1841 return ch 1842 } 1843 1844 type testSet struct { 1845 st *buildStatus 1846 items []*testItem 1847 testStats *buildstats.TestStats 1848 1849 mu sync.Mutex 1850 inOrder [][]*testItem 1851 biggestFirst [][]*testItem 1852 } 1853 1854 // cancelAll cancels all pending tests. 1855 func (s *testSet) cancelAll() { 1856 for _, ti := range s.items { 1857 ti.tryTake() // ignore return value 1858 } 1859 } 1860 1861 func (s *testSet) testsToRunInOrder() (chunk []*testItem, ok bool) { 1862 s.mu.Lock() 1863 defer s.mu.Unlock() 1864 if s.inOrder == nil { 1865 s.initInOrder() 1866 } 1867 return s.testsFromSlice(s.inOrder) 1868 } 1869 1870 func (s *testSet) testsToRunBiggestFirst() (chunk []*testItem, ok bool) { 1871 s.mu.Lock() 1872 defer s.mu.Unlock() 1873 if s.biggestFirst == nil { 1874 s.initBiggestFirst() 1875 } 1876 return s.testsFromSlice(s.biggestFirst) 1877 } 1878 1879 func (s *testSet) testsFromSlice(chunkList [][]*testItem) (chunk []*testItem, ok bool) { 1880 for _, candChunk := range chunkList { 1881 for _, ti := range candChunk { 1882 if ti.tryTake() { 1883 chunk = append(chunk, ti) 1884 } 1885 } 1886 if len(chunk) > 0 { 1887 return chunk, true 1888 } 1889 } 1890 return nil, false 1891 } 1892 1893 func (s *testSet) initInOrder() { 1894 names := make([]string, len(s.items)) 1895 namedItem := map[string]*testItem{} 1896 for i, ti := range s.items { 1897 names[i] = ti.name.Old 1898 namedItem[ti.name.Old] = ti 1899 } 1900 1901 // First do the go_test:* ones. partitionGoTests 1902 // only returns those, which are the ones we merge together. 1903 stdSets := partitionGoTests(s.testStats.Duration, s.st.BuilderRev.Name, names) 1904 for _, set := range stdSets { 1905 tis := make([]*testItem, len(set)) 1906 for i, name := range set { 1907 tis[i] = namedItem[name] 1908 } 1909 s.inOrder = append(s.inOrder, tis) 1910 } 1911 1912 // Then do the misc tests, which are always by themselves. 1913 // (No benefit to merging them) 1914 for _, ti := range s.items { 1915 if !strings.HasPrefix(ti.name.Old, "go_test:") { 1916 s.inOrder = append(s.inOrder, []*testItem{ti}) 1917 } 1918 } 1919 } 1920 1921 func partitionGoTests(testDuration func(string, string) time.Duration, builderName string, tests []string) (sets [][]string) { 1922 var srcTests []string 1923 var cmdTests []string 1924 for _, name := range tests { 1925 if strings.HasPrefix(name, "go_test:cmd/") { 1926 cmdTests = append(cmdTests, name) 1927 } else if strings.HasPrefix(name, "go_test:") { 1928 srcTests = append(srcTests, name) 1929 } 1930 } 1931 sort.Strings(srcTests) 1932 sort.Strings(cmdTests) 1933 goTests := append(srcTests, cmdTests...) 1934 1935 const sizeThres = 10 * time.Second 1936 1937 var curSet []string 1938 var curDur time.Duration 1939 1940 flush := func() { 1941 if len(curSet) > 0 { 1942 sets = append(sets, curSet) 1943 curSet = nil 1944 curDur = 0 1945 } 1946 } 1947 for _, testName := range goTests { 1948 d := testDuration(builderName, testName) 1949 if curDur+d > sizeThres { 1950 flush() // no-op if empty 1951 } 1952 curSet = append(curSet, testName) 1953 curDur += d 1954 } 1955 1956 flush() 1957 return 1958 } 1959 1960 func (s *testSet) initBiggestFirst() { 1961 items := append([]*testItem(nil), s.items...) 1962 sort.Sort(sort.Reverse(byTestDuration(items))) 1963 for _, item := range items { 1964 s.biggestFirst = append(s.biggestFirst, []*testItem{item}) 1965 } 1966 } 1967 1968 type testItem struct { 1969 set *testSet 1970 name distTestName 1971 duration time.Duration // optional approximate size 1972 1973 take chan token // buffered size 1: sending takes ownership of rest of fields: 1974 1975 done chan token // closed when done; guards output & failed 1976 numFail int // how many times it's failed to execute 1977 1978 // groupSize is the number of tests which were run together 1979 // along with this one with "go dist test". 1980 // This is 1 for non-std/cmd tests, and usually >1 for std/cmd tests. 1981 groupSize int 1982 shardIPPort string // buildlet's IPPort, for debugging 1983 1984 // the following are only set for the first item in a group: 1985 output []byte 1986 remoteErr error // real test failure (not a communications failure) 1987 execDuration time.Duration // actual time 1988 } 1989 1990 func (ti *testItem) tryTake() bool { 1991 select { 1992 case ti.take <- token{}: 1993 return true 1994 default: 1995 return false 1996 } 1997 } 1998 1999 // retry reschedules the test to run again, if a machine died before 2000 // or during execution, so its results aren't yet known. 2001 // The caller must own the 'take' semaphore. 2002 func (ti *testItem) retry() { 2003 // release it to make it available for somebody else to try later: 2004 <-ti.take 2005 } 2006 2007 func (ti *testItem) failf(format string, args ...interface{}) { 2008 msg := fmt.Sprintf(format, args...) 2009 ti.output = []byte(msg) 2010 ti.remoteErr = errors.New(msg) 2011 close(ti.done) 2012 } 2013 2014 // distTestName is the name of a dist test as discovered from 'go tool dist test -list'. 2015 type distTestName struct { 2016 Old string // Old is dist test name converted to Go 1.20 format, like "go_test:sort" or "reboot". 2017 Raw string // Raw is unmodified name from dist, suitable as an argument back to 'go tool dist test'. 2018 } 2019 2020 type byTestDuration []*testItem 2021 2022 func (s byTestDuration) Len() int { return len(s) } 2023 func (s byTestDuration) Less(i, j int) bool { return s[i].duration < s[j].duration } 2024 func (s byTestDuration) Swap(i, j int) { s[i], s[j] = s[j], s[i] } 2025 2026 type eventAndTime struct { 2027 t time.Time 2028 evt string // "get_source", "make_and_test", "make", etc 2029 text string // optional detail text 2030 } 2031 2032 var nl = []byte("\n") 2033 2034 // getRepoHead returns the commit hash of the latest master HEAD 2035 // for the given repo ("go", "tools", "sys", etc). 2036 func getRepoHead(repo string) (string, error) { 2037 // This gRPC call should only take a couple milliseconds, but set some timeout 2038 // to catch network problems. 5 seconds is overkill. 2039 ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) 2040 defer cancel() 2041 res, err := maintnerClient.GetRef(ctx, &apipb.GetRefRequest{ 2042 GerritServer: "go.googlesource.com", 2043 GerritProject: repo, 2044 Ref: "refs/heads/master", 2045 }) 2046 if err != nil { 2047 return "", fmt.Errorf("looking up ref for %q: %v", repo, err) 2048 } 2049 if res.Value == "" { 2050 return "", fmt.Errorf("no master ref found for %q", repo) 2051 } 2052 return res.Value, nil 2053 } 2054 2055 // newBuildLogBlob creates a new object to record a public build log. 2056 // The objName should be a Google Cloud Storage object name. 2057 // When developing on localhost, the WriteCloser may be of a different type. 2058 func newBuildLogBlob(objName string) (obj io.WriteCloser, url_ string) { 2059 if *mode == "dev" { 2060 // TODO(bradfitz): write to disk or something, or 2061 // something testable. Maybe memory. 2062 return struct { 2063 io.Writer 2064 io.Closer 2065 }{ 2066 os.Stderr, 2067 io.NopCloser(nil), 2068 }, "devmode://build-log/" + objName 2069 } 2070 if pool.NewGCEConfiguration().StorageClient() == nil { 2071 panic("nil storageClient in newFailureBlob") 2072 } 2073 bucket := pool.NewGCEConfiguration().BuildEnv().LogBucket 2074 2075 wr := pool.NewGCEConfiguration().StorageClient().Bucket(bucket).Object(objName).NewWriter(context.Background()) 2076 wr.ContentType = "text/plain; charset=utf-8" 2077 2078 return wr, fmt.Sprintf("https://storage.googleapis.com/%s/%s", bucket, objName) 2079 } 2080 2081 func randHex(n int) string { 2082 buf := make([]byte, n/2+1) 2083 if _, err := rand.Read(buf); err != nil { 2084 log.Fatalf("randHex: %v", err) 2085 } 2086 return fmt.Sprintf("%x", buf)[:n] 2087 } 2088 2089 // importPathOfRepo returns the Go import path corresponding to the 2090 // root of the given non-"go" repo (Gerrit project). Because it's a Go 2091 // import path, it always has forward slashes and no trailing slash. 2092 // 2093 // For example: 2094 // 2095 // "net" -> "golang.org/x/net" 2096 // "crypto" -> "golang.org/x/crypto" 2097 // "dl" -> "golang.org/dl" 2098 func importPathOfRepo(repo string) string { 2099 r := repos.ByGerritProject[repo] 2100 if r == nil { 2101 // mayBuildRev prevents adding work for repos we don't know about, 2102 // so this shouldn't happen. If it does, a panic will be useful. 2103 panic(fmt.Sprintf("importPathOfRepo(%q) on unknown repo %q", repo, repo)) 2104 } 2105 if r.ImportPath == "" { 2106 // Likewise. This shouldn't happen. 2107 panic(fmt.Sprintf("importPathOfRepo(%q) doesn't have an ImportPath", repo)) 2108 } 2109 return r.ImportPath 2110 } 2111 2112 // slowBotsFromComments looks at the Gerrit comments in work, 2113 // and returns all build configurations that were explicitly 2114 // requested to be tested as SlowBots via the TRY= syntax. It 2115 // also returns any build terms that are not a valid builder 2116 // or alias. 2117 func slowBotsFromComments(work *apipb.GerritTryWorkItem) (builders []*dashboard.BuildConfig, invalidTryTerms []string) { 2118 tryTerms := latestTryTerms(work) 2119 invalidTryTerms = slices.Clone(tryTerms) 2120 for _, bc := range dashboard.Builders { 2121 for _, term := range tryTerms { 2122 if bc.MatchesSlowBotTerm(term) { 2123 invalidTryTerms = slices.DeleteFunc(invalidTryTerms, func(e string) bool { 2124 return e == term 2125 }) 2126 builders = append(builders, bc) 2127 break 2128 } 2129 } 2130 } 2131 sort.Slice(builders, func(i, j int) bool { 2132 return builders[i].Name < builders[j].Name 2133 }) 2134 return builders, invalidTryTerms 2135 } 2136 2137 type xRepoAndBuilder struct { 2138 Project string // "net", "tools", etc. 2139 Builder string // Builder to use. Empty string means default builder. 2140 } 2141 2142 func (rb xRepoAndBuilder) String() string { 2143 if rb.Builder == "" { 2144 return rb.Project 2145 } 2146 return rb.Project + "@" + rb.Builder 2147 } 2148 2149 // xReposFromComments looks at the TRY= comments from Gerrit (in work) and 2150 // returns any additional subrepos that should be tested. The TRY= comments 2151 // are expected to be of the format TRY=x/foo or TRY=x/foo@builder where foo is 2152 // the name of the subrepo and builder is a builder name. If no builder is 2153 // provided, a default builder is used. 2154 func xReposFromComments(work *apipb.GerritTryWorkItem) map[xRepoAndBuilder]bool { 2155 xrepos := make(map[xRepoAndBuilder]bool) 2156 for _, term := range latestTryTerms(work) { 2157 if len(term) < len("x/_") || term[:2] != "x/" { 2158 continue 2159 } 2160 parts := strings.SplitN(term, "@", 2) 2161 xrepo := parts[0][2:] 2162 builder := "" // By convention, this means the default builder. 2163 if len(parts) > 1 { 2164 builder = parts[1] 2165 } 2166 xrepos[xRepoAndBuilder{ 2167 Project: xrepo, 2168 Builder: builder, 2169 }] = true 2170 } 2171 return xrepos 2172 } 2173 2174 // latestTryTerms returns the terms that follow the TRY= syntax in Gerrit comments. 2175 func latestTryTerms(work *apipb.GerritTryWorkItem) []string { 2176 tryMsg := latestTryMessage(work) // "aix, darwin, linux-386-387, arm64, x/tools" 2177 if tryMsg == "" { 2178 return nil 2179 } 2180 if len(tryMsg) > 1<<10 { // arbitrary sanity 2181 return nil 2182 } 2183 return strings.FieldsFunc(tryMsg, func(c rune) bool { 2184 return !unicode.IsLetter(c) && !unicode.IsNumber(c) && c != '-' && c != '_' && c != '/' && c != '@' 2185 }) 2186 } 2187 2188 func latestTryMessage(work *apipb.GerritTryWorkItem) string { 2189 // Prioritize exact version matches first 2190 for i := len(work.TryMessage) - 1; i >= 0; i-- { 2191 m := work.TryMessage[i] 2192 if m.Version == work.Version { 2193 return m.Message 2194 } 2195 } 2196 // Otherwise the latest message at all 2197 for i := len(work.TryMessage) - 1; i >= 0; i-- { 2198 m := work.TryMessage[i] 2199 if m.Message != "" { 2200 return m.Message 2201 } 2202 } 2203 return "" 2204 } 2205 2206 // handlePostSubmitActiveJSON serves JSON with the info for which builds 2207 // are currently building. The build.golang.org dashboard renders these as little 2208 // blue gophers that link to the each build's status. 2209 // TODO: this a transitional step on our way towards merging build.golang.org into 2210 // this codebase; see https://github.com/golang/go/issues/34744#issuecomment-563398753. 2211 func handlePostSubmitActiveJSON(w http.ResponseWriter, r *http.Request) { 2212 w.Header().Set("Content-Type", "application/json") 2213 json.NewEncoder(w).Encode(activePostSubmitBuilds()) 2214 } 2215 2216 func activePostSubmitBuilds() []types.ActivePostSubmitBuild { 2217 var ret []types.ActivePostSubmitBuild 2218 statusMu.Lock() 2219 defer statusMu.Unlock() 2220 for _, st := range status { 2221 if st.isTry() || !st.HasBuildlet() { 2222 continue 2223 } 2224 st.mu.Lock() 2225 logsURL := st.logsURLLocked() 2226 st.mu.Unlock() 2227 2228 var commit, goCommit string 2229 if st.IsSubrepo() { 2230 commit, goCommit = st.SubRev, st.Rev 2231 } else { 2232 commit = st.Rev 2233 } 2234 ret = append(ret, types.ActivePostSubmitBuild{ 2235 StatusURL: logsURL, 2236 Builder: st.Name, 2237 Commit: commit, 2238 GoCommit: goCommit, 2239 }) 2240 } 2241 return ret 2242 } 2243 2244 func mustCreateSecretClientOnGCE() *secret.Client { 2245 if !metadata.OnGCE() { 2246 return nil 2247 } 2248 return secret.MustNewClient() 2249 } 2250 2251 func mustCreateEC2BuildletPool(sc *secret.Client, isRemoteBuildlet func(instName string) bool) *pool.EC2Buildlet { 2252 awsKeyID, err := sc.Retrieve(context.Background(), secret.NameAWSKeyID) 2253 if err != nil { 2254 log.Fatalf("unable to retrieve secret %q: %s", secret.NameAWSKeyID, err) 2255 } 2256 2257 awsAccessKey, err := sc.Retrieve(context.Background(), secret.NameAWSAccessKey) 2258 if err != nil { 2259 log.Fatalf("unable to retrieve secret %q: %s", secret.NameAWSAccessKey, err) 2260 } 2261 2262 awsClient, err := cloud.NewAWSClient(buildenv.Production.AWSRegion, awsKeyID, awsAccessKey, cloud.WithRateLimiter(cloud.DefaultEC2LimitConfig)) 2263 if err != nil { 2264 log.Fatalf("unable to create AWS client: %s", err) 2265 } 2266 2267 ec2Pool, err := pool.NewEC2Buildlet(awsClient, buildenv.Production, dashboard.Hosts, isRemoteBuildlet) 2268 if err != nil { 2269 log.Fatalf("unable to create EC2 buildlet pool: %s", err) 2270 } 2271 return ec2Pool 2272 } 2273 2274 func mustRetrieveSSHCertificateAuthority() (privateKey []byte) { 2275 privateKey, _, err := remote.SSHKeyPair() 2276 if err != nil { 2277 log.Fatalf("unable to create SSH CA cert: %s", err) 2278 } 2279 return 2280 } 2281 2282 func mustStorageClient() *storage.Client { 2283 if metadata.OnGCE() { 2284 return pool.NewGCEConfiguration().StorageClient() 2285 } 2286 storageClient, err := storage.NewClient(context.Background(), option.WithoutAuthentication()) 2287 if err != nil { 2288 log.Fatalf("unable to create storage client: %s", err) 2289 } 2290 return storageClient 2291 } 2292 2293 func fromSecret(ctx context.Context, sc *secret.Client, secretName string) (string, error) { 2294 ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) 2295 defer cancel() 2296 return sc.Retrieve(ctx, secretName) 2297 } 2298 2299 func retrieveSSHKeys(ctx context.Context, sc *secret.Client, m string) (publicKey, privateKey []byte, err error) { 2300 if m == "dev" { 2301 return remote.SSHKeyPair() 2302 } else if metadata.OnGCE() { 2303 privateKeyS, err := fromSecret(ctx, sc, secret.NameGomoteSSHPrivateKey) 2304 if err != nil { 2305 return nil, nil, err 2306 } 2307 publicKeyS, err := fromSecret(ctx, sc, secret.NameGomoteSSHPublicKey) 2308 if err != nil { 2309 return nil, nil, err 2310 } 2311 return []byte(privateKeyS), []byte(publicKeyS), nil 2312 } 2313 return nil, nil, fmt.Errorf("unable to retrieve ssh keys") 2314 }