golang.org/x/build@v0.0.0-20240506185731-218518f32b70/cmd/coordinator/status.go (about) 1 // Copyright 2015 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 //go:build linux || darwin 6 7 package main 8 9 import ( 10 "bufio" 11 "bytes" 12 "context" 13 _ "embed" 14 "errors" 15 "fmt" 16 "html" 17 "html/template" 18 "io" 19 "log" 20 "net/http" 21 "os" 22 "os/exec" 23 "regexp" 24 "runtime" 25 "sort" 26 "strings" 27 "sync" 28 "sync/atomic" 29 "time" 30 31 "github.com/google/go-github/github" 32 "go.opencensus.io/stats" 33 "golang.org/x/build/dashboard" 34 "golang.org/x/build/internal/coordinator/pool" 35 "golang.org/x/build/internal/coordinator/remote" 36 "golang.org/x/build/internal/coordinator/schedule" 37 "golang.org/x/build/internal/secret" 38 "golang.org/x/build/kubernetes" 39 "golang.org/x/build/kubernetes/api" 40 "golang.org/x/oauth2" 41 "google.golang.org/grpc" 42 ) 43 44 // status 45 type statusLevel int 46 47 const ( 48 // levelInfo is an informational text that's not an error, 49 // such as "coordinator just started recently, waiting to 50 // start health check" 51 levelInfo statusLevel = iota 52 // levelWarn is a non-critical error, such as "missing 1 of 50 53 // of ARM machines" 54 levelWarn 55 // levelError is something that should be fixed sooner, such 56 // as "all Macs are gone". 57 levelError 58 ) 59 60 func (l statusLevel) String() string { 61 switch l { 62 case levelInfo: 63 return "Info" 64 case levelWarn: 65 return "Warn" 66 case levelError: 67 return "Error" 68 } 69 return "" 70 } 71 72 type levelText struct { 73 Level statusLevel 74 Text string 75 } 76 77 func (lt levelText) AsHTML() template.HTML { 78 switch lt.Level { 79 case levelInfo: 80 return template.HTML(html.EscapeString(lt.Text)) 81 case levelWarn: 82 return template.HTML(fmt.Sprintf("<span style='color: orange'>%s</span>", html.EscapeString(lt.Text))) 83 case levelError: 84 return template.HTML(fmt.Sprintf("<span style='color: red'><b>%s</b></span>", html.EscapeString(lt.Text))) 85 } 86 return "" 87 } 88 89 type checkWriter struct { 90 Out []levelText 91 } 92 93 func (w *checkWriter) error(s string) { w.Out = append(w.Out, levelText{levelError, s}) } 94 func (w *checkWriter) errorf(a string, args ...interface{}) { w.error(fmt.Sprintf(a, args...)) } 95 func (w *checkWriter) info(s string) { w.Out = append(w.Out, levelText{levelInfo, s}) } 96 func (w *checkWriter) infof(a string, args ...interface{}) { w.info(fmt.Sprintf(a, args...)) } 97 func (w *checkWriter) warn(s string) { w.Out = append(w.Out, levelText{levelWarn, s}) } 98 func (w *checkWriter) warnf(a string, args ...interface{}) { w.warn(fmt.Sprintf(a, args...)) } 99 func (w *checkWriter) hasErrors() bool { 100 for _, v := range w.Out { 101 if v.Level == levelError { 102 return true 103 } 104 } 105 return false 106 } 107 108 type healthChecker struct { 109 ID string 110 Title string 111 DocURL string 112 113 // Check writes the health check status to a checkWriter. 114 // 115 // It's called when rendering the HTML page, so expensive 116 // operations (network calls, etc.) should be done in a 117 // separate goroutine and Check should report their results. 118 Check func(*checkWriter) 119 } 120 121 func (hc *healthChecker) DoCheck() *checkWriter { 122 cw := new(checkWriter) 123 hc.Check(cw) 124 return cw 125 } 126 127 var ( 128 healthCheckers []*healthChecker 129 healthCheckerByID = map[string]*healthChecker{} 130 ) 131 132 func addHealthChecker(mux *http.ServeMux, hc *healthChecker) { 133 if _, dup := healthCheckerByID[hc.ID]; dup { 134 panic("duplicate health checker ID " + hc.ID) 135 } 136 healthCheckers = append(healthCheckers, hc) 137 healthCheckerByID[hc.ID] = hc 138 mux.Handle("/status/"+hc.ID, healthCheckerHandler(hc)) 139 } 140 141 // basePinErr is the status of the start-up time basepin disk creation 142 // in gce.go. It's of type string; no value means no result yet, 143 // empty string means success, and non-empty means an error. 144 var basePinErr atomic.Value 145 146 func addHealthCheckers(ctx context.Context, mux *http.ServeMux, sc *secret.Client) { 147 addHealthChecker(mux, newMacOSARM64Checker()) 148 addHealthChecker(mux, newBasepinChecker()) 149 addHealthChecker(mux, newGitMirrorChecker()) 150 addHealthChecker(mux, newGitHubAPIChecker(ctx, sc)) 151 } 152 153 func newBasepinChecker() *healthChecker { 154 return &healthChecker{ 155 ID: "basepin", 156 Title: "VM snapshots", 157 DocURL: "https://golang.org/issue/21305", 158 Check: func(w *checkWriter) { 159 v := basePinErr.Load() 160 if v == nil { 161 w.warnf("still running") 162 return 163 } 164 if v == "" { 165 return 166 } 167 w.error(v.(string)) 168 }, 169 } 170 } 171 172 // gitMirrorStatus is the latest known status of the gitmirror service. 173 var gitMirrorStatus = struct { 174 sync.Mutex 175 Errors []string 176 Warnings []string 177 }{Warnings: []string{"still checking"}} 178 179 func monitorGitMirror(kcl *kubernetes.Client) { 180 for { 181 errs, warns := gitMirrorErrors(kcl) 182 gitMirrorStatus.Lock() 183 gitMirrorStatus.Errors, gitMirrorStatus.Warnings = errs, warns 184 gitMirrorStatus.Unlock() 185 time.Sleep(30 * time.Second) 186 } 187 } 188 189 // gitMirrorErrors queries the status pages of all 190 // running gitmirror instances and reports errors. 191 func gitMirrorErrors(kcl *kubernetes.Client) (errs, warns []string) { 192 ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second) 193 defer cancel() 194 pods, err := kcl.GetPods(ctx) 195 if err != nil { 196 log.Println("gitMirrorErrors: goKubeClient.GetPods:", err) 197 return []string{"failed to get pods; can't query gitmirror status"}, nil 198 } 199 var runningGitMirror []api.Pod 200 for _, p := range pods { 201 if !strings.HasPrefix(p.Labels["app"], "gitmirror") || p.Status.Phase != "Running" { 202 continue 203 } 204 runningGitMirror = append(runningGitMirror, p) 205 } 206 if len(runningGitMirror) == 0 { 207 return []string{"no running gitmirror instances"}, nil 208 } 209 for _, pod := range runningGitMirror { 210 // The gitmirror -http=:8585 status page URL is hardcoded here. 211 // If the ReplicationController configuration changes (rare), this 212 // health check will begin to fail until it's updated accordingly. 213 instErrs, instWarns := gitMirrorInstanceErrors(ctx, fmt.Sprintf("http://%s:8585/", pod.Status.PodIP)) 214 for _, err := range instErrs { 215 errs = append(errs, fmt.Sprintf("instance %s: %s", pod.Name, err)) 216 } 217 for _, warn := range instWarns { 218 warns = append(warns, fmt.Sprintf("instance %s: %s", pod.Name, warn)) 219 } 220 } 221 return errs, warns 222 } 223 224 func gitMirrorInstanceErrors(ctx context.Context, url string) (errs, warns []string) { 225 req, _ := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) 226 res, err := http.DefaultClient.Do(req) 227 if err != nil { 228 return []string{err.Error()}, nil 229 } 230 defer res.Body.Close() 231 if res.StatusCode != 200 { 232 return []string{res.Status}, nil 233 } 234 // TODO: add a JSON mode to gitmirror so we don't need to parse HTML. 235 // This works for now. We control its output. 236 bs := bufio.NewScanner(res.Body) 237 for bs.Scan() { 238 // Lines look like: 239 // <html><body><pre><a href='/debug/watcher/arch'>arch</a> - ok 240 // or: 241 // <a href='/debug/watcher/arch'>arch</a> - ok 242 // (See https://farmer.golang.org/debug/watcher/) 243 line := bs.Text() 244 if strings.HasSuffix(line, " - ok") { 245 continue 246 } 247 m := gitMirrorLineRx.FindStringSubmatch(line) 248 if len(m) != 3 { 249 if strings.Contains(line, "</html>") { 250 break 251 } 252 return []string{fmt.Sprintf("error parsing line %q", line)}, nil 253 } 254 if strings.HasPrefix(m[2], "ok; ") { 255 // If the status begins with "ok", it can't be that bad. 256 warns = append(warns, fmt.Sprintf("repo %s: %s", m[1], m[2])) 257 continue 258 } 259 errs = append(errs, fmt.Sprintf("repo %s: %s", m[1], m[2])) 260 } 261 if err := bs.Err(); err != nil { 262 errs = append(errs, err.Error()) 263 } 264 return errs, warns 265 } 266 267 // $1 is repo; $2 is error message 268 var gitMirrorLineRx = regexp.MustCompile(`/debug/watcher/([\w-]+).?>.+</a> - (.*)`) 269 270 func newGitMirrorChecker() *healthChecker { 271 return &healthChecker{ 272 ID: "gitmirror", 273 Title: "Git mirroring", 274 DocURL: "https://github.com/golang/build/tree/master/cmd/gitmirror", 275 Check: func(w *checkWriter) { 276 gitMirrorStatus.Lock() 277 errs, warns := gitMirrorStatus.Errors, gitMirrorStatus.Warnings 278 gitMirrorStatus.Unlock() 279 for _, v := range errs { 280 w.error(v) 281 } 282 for _, v := range warns { 283 w.warn(v) 284 } 285 }, 286 } 287 } 288 289 func newMacOSARM64Checker() *healthChecker { 290 var expect int // Number of expected darwin/arm64 reverse builders based on x/build/dashboard. 291 for hostType, hc := range dashboard.Hosts { 292 if !strings.HasPrefix(hostType, "host-darwin-arm64-") || !hc.IsReverse { 293 continue 294 } 295 expect += hc.ExpectNum 296 } 297 var hosts []string 298 for i := 1; i <= expect; i++ { 299 hosts = append(hosts, fmt.Sprintf("fishbowl-%02d.local", i)) 300 } 301 return &healthChecker{ 302 ID: "macos-arm64", 303 Title: "macOS ARM64 (M1 Mac minis)", 304 DocURL: "https://golang.org/issue/39782", 305 Check: reverseHostChecker(hosts), 306 } 307 } 308 309 func expectedHosts(hostType string) int { 310 hc, ok := dashboard.Hosts[hostType] 311 if !ok { 312 panic(fmt.Sprintf("unknown host type %q", hostType)) 313 } 314 return hc.ExpectNum 315 } 316 317 func reverseHostChecker(hosts []string) func(cw *checkWriter) { 318 const recentThreshold = 2 * time.Minute // let VMs be away 2 minutes; assume ~1 minute bootup + slop 319 checkStart := time.Now().Add(recentThreshold) 320 321 hostSet := map[string]bool{} 322 for _, v := range hosts { 323 hostSet[v] = true 324 } 325 326 // TODO(amedee): rethink how this is implemented. It has been 327 // modified due to golang.org/issues/36841 328 // instead of a single lock being held while all of the 329 // operations are performed, there is now a lock held 330 // during each BuildletLastSeen call and again when 331 // the buildlet host names are retrieved. 332 return func(cw *checkWriter) { 333 p := pool.ReversePool() 334 335 now := time.Now() 336 wantGoodSince := now.Add(-recentThreshold) 337 numMissing := 0 338 numGood := 0 339 // Check last good times 340 for _, host := range hosts { 341 lastGood, ok := p.BuildletLastSeen(host) 342 if ok && lastGood.After(wantGoodSince) { 343 numGood++ 344 continue 345 } 346 if now.Before(checkStart) { 347 cw.infof("%s not yet connected", host) 348 continue 349 } 350 if ok { 351 cw.warnf("%s missing, not seen for %v", host, time.Now().Sub(lastGood).Round(time.Second)) 352 } else { 353 cw.warnf("%s missing, never seen (at least %v)", host, uptime()) 354 } 355 numMissing++ 356 } 357 if numMissing > 0 { 358 sum := numMissing + numGood 359 percentMissing := float64(numMissing) / float64(sum) 360 msg := fmt.Sprintf("%d machines missing, %.0f%% of capacity", numMissing, percentMissing*100) 361 if percentMissing >= 0.15 { 362 cw.error(msg) 363 } else { 364 cw.warn(msg) 365 } 366 } 367 368 // And check that we don't have more than 1 369 // connected of any type. 370 count := map[string]int{} 371 for _, hostname := range p.BuildletHostnames() { 372 if hostSet[hostname] { 373 count[hostname]++ 374 } 375 } 376 for name, n := range count { 377 if n > 1 { 378 cw.errorf("%q is connected from %v machines", name, n) 379 } 380 } 381 } 382 } 383 384 // newGitHubAPIChecker creates a GitHub API health checker 385 // that queries the remaining rate limit at regular invervals 386 // and reports when the hourly quota has been exceeded. 387 // 388 // It also records metrics to track remaining rate limit over time. 389 func newGitHubAPIChecker(ctx context.Context, sc *secret.Client) *healthChecker { 390 // githubRate is the status of the GitHub API v3 client. 391 // It's of type *github.Rate; no value means no result yet, 392 // nil value means no recent result. 393 var githubRate atomic.Value 394 395 hc := &healthChecker{ 396 ID: "githubapi", 397 Title: "GitHub API Rate Limit", 398 DocURL: "https://golang.org/issue/44406", 399 Check: func(w *checkWriter) { 400 rate, ok := githubRate.Load().(*github.Rate) 401 if !ok { 402 w.warn("still checking") 403 } else if rate == nil { 404 w.warn("no recent result") 405 } else if rate.Remaining == 0 { 406 resetIn := "a minute or so" 407 if t := time.Until(rate.Reset.Time); t > time.Minute { 408 resetIn = t.Round(time.Second).String() 409 } 410 w.warnf("hourly GitHub API rate limit exceeded; reset in %s", resetIn) 411 } 412 }, 413 } 414 415 // Start measuring and reporting the remaining GitHub API v3 rate limit. 416 if sc == nil { 417 hc.Check = func(w *checkWriter) { 418 w.info("check disabled; credentials were not provided") 419 } 420 return hc 421 } 422 token, err := sc.Retrieve(ctx, secret.NameMaintnerGitHubToken) 423 if err != nil { 424 log.Printf("newGitHubAPIChecker: sc.Retrieve(_, %q) failed, err = %v\n", secret.NameMaintnerGitHubToken, err) 425 hc.Check = func(w *checkWriter) { 426 // The check is displayed publicly, so don't include details from err. 427 w.error("failed to retrieve API token") 428 } 429 return hc 430 } 431 gh := github.NewClient(oauth2.NewClient(ctx, oauth2.StaticTokenSource(&oauth2.Token{AccessToken: token}))) 432 go func() { 433 t := time.NewTicker(time.Minute) 434 defer t.Stop() 435 for { 436 // Fetch the current rate limit from the GitHub API. 437 // This endpoint is special in that it doesn't consume rate limit quota itself. 438 var rate *github.Rate 439 rateLimitsCtx, cancel := context.WithTimeout(ctx, 10*time.Second) 440 rl, _, err := gh.RateLimits(rateLimitsCtx) 441 cancel() 442 if rle := (*github.RateLimitError)(nil); errors.As(err, &rle) { 443 rate = &rle.Rate 444 } else if err != nil { 445 log.Println("GitHubAPIChecker: github.RateLimits:", err) 446 } else { 447 rate = rl.GetCore() 448 } 449 450 // Store the result of fetching, and record the current rate limit, if any. 451 githubRate.Store(rate) 452 if rate != nil { 453 stats.Record(ctx, mGitHubAPIRemaining.M(int64(rate.Remaining))) 454 } 455 456 select { 457 case <-t.C: 458 case <-ctx.Done(): 459 return 460 } 461 } 462 }() 463 464 return hc 465 } 466 467 func healthCheckerHandler(hc *healthChecker) http.Handler { 468 return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 469 cw := new(checkWriter) 470 hc.Check(cw) 471 w.Header().Set("Content-Type", "text/plain; charset=utf-8") 472 if cw.hasErrors() { 473 w.WriteHeader(500) 474 } else { 475 w.WriteHeader(200) 476 } 477 if len(cw.Out) == 0 { 478 io.WriteString(w, "ok\n") 479 return 480 } 481 fmt.Fprintf(w, "# %q status: %s\n", hc.ID, hc.Title) 482 if hc.DocURL != "" { 483 fmt.Fprintf(w, "# Notes: %v\n", hc.DocURL) 484 } 485 for _, v := range cw.Out { 486 fmt.Fprintf(w, "%s: %s\n", v.Level, v.Text) 487 } 488 }) 489 } 490 491 func uptime() time.Duration { return time.Since(processStartTime).Round(time.Second) } 492 493 // grpcHandlerFunc creates handler which intercepts requests intended for a GRPC server and directs the calls to the server. 494 // All other requests are directed toward the passed in handler. 495 func grpcHandlerFunc(gs *grpc.Server, h http.HandlerFunc) http.HandlerFunc { 496 return func(w http.ResponseWriter, r *http.Request) { 497 if r.ProtoMajor == 2 && strings.HasPrefix(r.Header.Get("Content-Type"), "application/grpc") { 498 gs.ServeHTTP(w, r) 499 return 500 } 501 h(w, r) 502 } 503 } 504 505 func handleStatus(w http.ResponseWriter, r *http.Request) { 506 if r.URL.Path != "/" { 507 http.NotFound(w, r) 508 return 509 } 510 df := diskFree() 511 512 statusMu.Lock() 513 data := statusData{ 514 Total: len(status), 515 Uptime: uptime(), 516 Recent: append([]*buildStatus{}, statusDone...), 517 DiskFree: df, 518 Version: Version, 519 NumFD: fdCount(), 520 NumGoroutine: runtime.NumGoroutine(), 521 HealthCheckers: healthCheckers, 522 } 523 for _, st := range status { 524 if st.HasBuildlet() { 525 data.ActiveBuilds++ 526 data.Active = append(data.Active, st) 527 if st.conf.IsReverse() { 528 data.ActiveReverse++ 529 } 530 } else { 531 data.Pending = append(data.Pending, st) 532 } 533 } 534 // TODO: make this prettier. 535 var buf bytes.Buffer 536 for _, key := range tryList { 537 if ts := tries[key]; ts != nil { 538 state := ts.state() 539 fmt.Fprintf(&buf, "Change-ID: %v Commit: %v (<a href='/try?commit=%v'>status</a>)\n", 540 key.ChangeTriple(), key.Commit, key.Commit[:8]) 541 fmt.Fprintf(&buf, " Remain: %d, fails: %v\n", state.remain, state.failed) 542 for _, bs := range ts.builds { 543 fmt.Fprintf(&buf, " %s: running=%v\n", bs.Name, bs.isRunning()) 544 } 545 } 546 } 547 statusMu.Unlock() 548 549 gce := pool.NewGCEConfiguration() 550 data.GomoteInstances = remoteSessionStatus() 551 552 sort.Sort(byAge(data.Active)) 553 sort.Sort(byAge(data.Pending)) 554 sort.Sort(sort.Reverse(byAge(data.Recent))) 555 if gce.TryDepsErr() != nil { 556 data.TrybotsErr = gce.TryDepsErr().Error() 557 } else { 558 if buf.Len() == 0 { 559 data.Trybots = template.HTML("<i>(none)</i>") 560 } else { 561 data.Trybots = template.HTML("<pre>" + buf.String() + "</pre>") 562 } 563 } 564 565 buf.Reset() 566 gce.BuildletPool().WriteHTMLStatus(&buf) 567 data.GCEPoolStatus = template.HTML(buf.String()) 568 buf.Reset() 569 570 buf.Reset() 571 pool.EC2BuildetPool().WriteHTMLStatus(&buf) 572 data.EC2PoolStatus = template.HTML(buf.String()) 573 buf.Reset() 574 575 pool.ReversePool().WriteHTMLStatus(&buf) 576 data.ReversePoolStatus = template.HTML(buf.String()) 577 578 data.SchedState = sched.State() 579 580 buf.Reset() 581 if err := statusTmpl.Execute(&buf, data); err != nil { 582 http.Error(w, err.Error(), http.StatusInternalServerError) 583 return 584 } 585 buf.WriteTo(w) 586 } 587 588 func fdCount() int { 589 f, err := os.Open("/proc/self/fd") 590 if err != nil { 591 return -1 592 } 593 defer f.Close() 594 n := 0 595 for { 596 names, err := f.Readdirnames(1000) 597 n += len(names) 598 if err == io.EOF { 599 return n 600 } 601 if err != nil { 602 return -1 603 } 604 } 605 } 606 607 func friendlyDuration(d time.Duration) string { 608 if d > 10*time.Second { 609 d2 := ((d + 50*time.Millisecond) / (100 * time.Millisecond)) * (100 * time.Millisecond) 610 return d2.String() 611 } 612 if d > time.Second { 613 d2 := ((d + 5*time.Millisecond) / (10 * time.Millisecond)) * (10 * time.Millisecond) 614 return d2.String() 615 } 616 d2 := ((d + 50*time.Microsecond) / (100 * time.Microsecond)) * (100 * time.Microsecond) 617 return d2.String() 618 } 619 620 func diskFree() string { 621 out, _ := exec.Command("df", "-h").Output() 622 return string(out) 623 } 624 625 // statusData is the data that fills out statusTmpl. 626 type statusData struct { 627 Total int // number of total builds (including those waiting for a buildlet) 628 ActiveBuilds int // number of running builds (subset of Total with a buildlet) 629 ActiveReverse int // subset of ActiveBuilds that are reverse buildlets 630 NumFD int 631 NumGoroutine int 632 Uptime time.Duration 633 Active []*buildStatus // have a buildlet 634 Pending []*buildStatus // waiting on a buildlet 635 Recent []*buildStatus 636 TrybotsErr string 637 Trybots template.HTML 638 GCEPoolStatus template.HTML // TODO: embed template 639 EC2PoolStatus template.HTML // TODO: embed template 640 ReversePoolStatus template.HTML // TODO: embed template 641 GomoteInstances template.HTML 642 SchedState schedule.SchedulerState 643 DiskFree string 644 Version string 645 HealthCheckers []*healthChecker 646 } 647 648 //go:embed templates/base.html 649 var baseTmplStr string 650 651 // baseTmpl defines common templates for reuse in other coordinator templates. 652 var baseTmpl = template.Must(template.New("").Parse(baseTmplStr)) 653 654 //go:embed templates/status.html 655 var statusTmplStr string 656 657 var statusTmpl = template.Must(baseTmpl.New("status").Parse(statusTmplStr)) 658 659 //go:embed style.css 660 var styleCSS []byte 661 662 func handleStyleCSS(w http.ResponseWriter, r *http.Request) { 663 w.Header().Set("Cache-Control", "no-cache, private, max-age=0") 664 http.ServeContent(w, r, "style.css", processStartTime, bytes.NewReader(styleCSS)) 665 } 666 667 // statusSessionPool to be used exclusively in the status file. 668 var statusSessionPool *remote.SessionPool 669 670 // setSessionPool sets the session pool for use in the status file. 671 func setSessionPool(sp *remote.SessionPool) { 672 statusSessionPool = sp 673 } 674 675 // remoteSessionStatus creates the status HTML for the sessions in the session pool. 676 func remoteSessionStatus() template.HTML { 677 sessions := statusSessionPool.List() 678 if len(sessions) == 0 { 679 return "<i>(none)</i>" 680 } 681 var buf bytes.Buffer 682 buf.WriteString("<ul>") 683 for _, s := range sessions { 684 fmt.Fprintf(&buf, "<li><b>%s</b>, created %v ago, expires in %v</li>\n", 685 html.EscapeString(s.ID), 686 time.Since(s.Created), time.Until(s.Expires)) 687 } 688 buf.WriteString("</ul>") 689 return template.HTML(buf.String()) 690 }