// Source: golang.org/x/build@v0.0.0-20240506185731-218518f32b70/internal/coordinator/pool/reverse.go

// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build linux || darwin

package pool

/*
This file implements reverse buildlets. These are buildlets that are not
started by the coordinator. They dial the coordinator and then accept
instructions. This feature is used for machines that cannot be started by
an API, for example real OS X machines with iOS and Android devices attached.

You can test this setup locally. In one terminal start a coordinator.
It will default to dev mode, using a dummy TLS cert and not talking to GCE.

	$ coordinator

In another terminal, start a reverse buildlet:

	$ buildlet -reverse "darwin-amd64"

It will dial and register itself with the coordinator. To confirm the
coordinator can see the buildlet, check the logs output or visit its
diagnostics page: https://localhost:8119. To send the buildlet some
work, go to:

	https://localhost:8119/dosomework
*/

import (
	"bytes"
	"context"
	"crypto/hmac"
	"crypto/md5"
	"errors"
	"fmt"
	"io"
	"log"
	"math/rand"
	"net"
	"net/http"
	"sort"
	"sync"
	"time"

	"golang.org/x/build/buildlet"
	"golang.org/x/build/dashboard"
	"golang.org/x/build/internal/coordinator/pool/queue"
	"golang.org/x/build/revdial/v2"
)

// minBuildletVersion is the lowest buildlet status version that
// HandleReverse accepts; a buildlet reporting a lower version has its
// connection closed instead of being added to the pool.
const minBuildletVersion = 23

var (
	// reversePool is the package-level pool returned by ReversePool and
	// used by HandleReverse to register newly connected buildlets.
	reversePool = &ReverseBuildletPool{
		hostLastGood: make(map[string]time.Time),
		hostQueue:    make(map[string]*queue.Quota),
	}

	// builderMasterKey is the HMAC key from which builderKey derives the
	// per-builder keys. It is assigned by SetBuilderMasterKey.
	builderMasterKey []byte
)

// SetBuilderMasterKey sets the builder master key used
// to generate keys used by the builders.
//
// The slice is stored as-is (not copied) and the assignment is not
// synchronized with readers.
func SetBuilderMasterKey(masterKey []byte) {
	builderMasterKey = masterKey
}

// ReversePool retrieves the reverse buildlet pool.
72 func ReversePool() *ReverseBuildletPool { 73 return reversePool 74 } 75 76 // ReverseBuildletPool manages the pool of reverse buildlet pools. 77 type ReverseBuildletPool struct { 78 // mu guards all 5 fields below and also fields of 79 // *reverseBuildlet in buildlets 80 mu sync.Mutex 81 82 // buildlets are the currently connected buildlets. 83 // TODO: switch to a map[hostType][]buildlets or map of set. 84 buildlets []*reverseBuildlet 85 86 hostQueue map[string]*queue.Quota 87 88 // hostLastGood tracks when buildlets were last seen to be 89 // healthy. It's only used by the health reporting code (in 90 // status.go). The reason it's a map on ReverseBuildletPool 91 // rather than a field on each reverseBuildlet is because we 92 // also want to track the last known health time of buildlets 93 // that aren't currently connected. 94 // 95 // Each buildlet's health is recorded in the map twice, under 96 // two different keys: 1) its reported host name, and 2) its 97 // hostType + ":" + its reported host name. It's recorded both 98 // ways so the status code can check for both globally-unique 99 // hostnames that change host types (e.g. our Macs), as well 100 // as hostnames that aren't globally unique and are expected 101 // to be found with different hostTypes (e.g. our ppc64le 102 // machines as both POWER8 and POWER9 host types, but with the 103 // same names). 104 hostLastGood map[string]time.Time 105 } 106 107 // BuildletLastSeen gives the last time a buildlet was connected to the pool. If 108 // the buildlet has not been seen a false is returned by the boolean. 109 func (p *ReverseBuildletPool) BuildletLastSeen(host string) (time.Time, bool) { 110 p.mu.Lock() 111 defer p.mu.Unlock() 112 113 t, ok := p.hostLastGood[host] 114 return t, ok 115 } 116 117 // tryToGrab returns non-nil bc on success if a buildlet is free. 118 // 119 // Otherwise it returns how many were busy, which might be 0 if none 120 // were (yet?) registered. 
The busy valid is only valid if bc == nil. 121 func (p *ReverseBuildletPool) tryToGrab(hostType string) (bc buildlet.Client, busy int) { 122 p.mu.Lock() 123 defer p.mu.Unlock() 124 defer p.updateQuotasLocked() 125 for _, b := range p.buildlets { 126 if b.hostType != hostType { 127 continue 128 } 129 if b.inUse { 130 busy++ 131 continue 132 } 133 // Found an unused match. 134 b.inUse = true 135 b.inUseTime = time.Now() 136 return b.client, 0 137 } 138 return nil, busy 139 } 140 141 // nukeBuildlet wipes out victim as a buildlet we'll ever return again, 142 // and closes its TCP connection in hopes that it will fix itself 143 // later. 144 func (p *ReverseBuildletPool) nukeBuildlet(victim buildlet.Client) { 145 p.mu.Lock() 146 defer p.mu.Unlock() 147 defer p.updateQuotasLocked() 148 for i, rb := range p.buildlets { 149 if rb.client == victim { 150 defer rb.conn.Close() 151 p.buildlets = append(p.buildlets[:i], p.buildlets[i+1:]...) 152 return 153 } 154 } 155 } 156 157 // healthCheckBuildletLoop periodically requests the status from b. 158 // If the buildlet fails to respond promptly, it is removed from the pool. 159 func (p *ReverseBuildletPool) healthCheckBuildletLoop(b *reverseBuildlet) { 160 for { 161 time.Sleep(time.Duration(10+rand.Intn(5)) * time.Second) 162 if !p.healthCheckBuildlet(b) { 163 return 164 } 165 } 166 } 167 168 // recordHealthy updates the two map entries in hostLastGood recording 169 // that b is healthy. 
170 func (p *ReverseBuildletPool) recordHealthy(b *reverseBuildlet) { 171 t := time.Now() 172 p.hostLastGood[b.hostname] = t 173 p.hostLastGood[b.hostType+":"+b.hostname] = t 174 } 175 176 func (p *ReverseBuildletPool) healthCheckBuildlet(b *reverseBuildlet) bool { 177 defer p.updateQuotas() 178 if b.client.IsBroken() { 179 return false 180 } 181 p.mu.Lock() 182 if b.inHealthCheck { // sanity check 183 panic("previous health check still running") 184 } 185 if b.inUse { 186 p.recordHealthy(b) 187 p.mu.Unlock() 188 return true // skip busy buildlets 189 } 190 b.inUse = true 191 b.inHealthCheck = true 192 b.inUseTime = time.Now() 193 res := make(chan error, 1) 194 go func() { 195 _, err := b.client.Status(context.Background()) 196 res <- err 197 }() 198 p.mu.Unlock() 199 200 t := time.NewTimer(20 * time.Second) // give buildlets time to respond 201 var err error 202 select { 203 case err = <-res: 204 t.Stop() 205 case <-t.C: 206 err = errors.New("health check timeout") 207 } 208 209 if err != nil { 210 // remove bad buildlet 211 log.Printf("Health check fail; removing reverse buildlet %v (type %v): %v", b.hostname, b.hostType, err) 212 go b.client.Close() 213 go p.nukeBuildlet(b.client) 214 return false 215 } 216 217 p.mu.Lock() 218 defer p.mu.Unlock() 219 220 if !b.inHealthCheck { 221 // buildlet was grabbed while lock was released; harmless. 222 return true 223 } 224 b.inUse = false 225 b.inHealthCheck = false 226 b.inUseTime = time.Now() 227 p.recordHealthy(b) 228 return true 229 } 230 231 func (p *ReverseBuildletPool) hostTypeQueue(hostType string) *queue.Quota { 232 if p.hostQueue[hostType] == nil { 233 queue := queue.NewQuota() 234 p.hostQueue[hostType] = queue 235 } 236 return p.hostQueue[hostType] 237 } 238 239 // GetBuildlet builds a buildlet client for the passed in host. 
240 func (p *ReverseBuildletPool) GetBuildlet(ctx context.Context, hostType string, lg Logger, si *queue.SchedItem) (buildlet.Client, error) { 241 sp := lg.CreateSpan("wait_static_builder", hostType) 242 // No need to return quota when done. The quotas will be updated 243 // when the reverse buildlet reconnects and becomes healthy. 244 err := p.hostTypeQueue(hostType).AwaitQueue(ctx, 1, si) 245 sp.Done(err) 246 if err != nil { 247 return nil, err 248 } 249 250 seenErrInUse := false 251 for { 252 bc, busy := p.tryToGrab(hostType) 253 if bc != nil { 254 sp.Done(nil) 255 return p.cleanedBuildlet(bc, lg) 256 } 257 if busy > 0 && !seenErrInUse { 258 lg.LogEventTime("waiting_machine_in_use") 259 seenErrInUse = true 260 } 261 select { 262 case <-ctx.Done(): 263 return nil, sp.Done(ctx.Err()) 264 case <-time.After(10 * time.Second): 265 } 266 } 267 } 268 269 func (p *ReverseBuildletPool) cleanedBuildlet(b buildlet.Client, lg Logger) (buildlet.Client, error) { 270 // Clean up any files from previous builds. 271 sp := lg.CreateSpan("clean_buildlet", b.String()) 272 err := b.RemoveAll(context.Background(), ".") 273 sp.Done(err) 274 if err != nil { 275 b.Close() 276 return nil, err 277 } 278 return b, nil 279 } 280 281 // WriteHTMLStatus writes a status of the reverse buildlet pool, in HTML format, 282 // to the passed in io.Writer. 283 func (p *ReverseBuildletPool) WriteHTMLStatus(w io.Writer) { 284 // total maps from a host type to the number of machines which are 285 // capable of that role. 286 total := make(map[string]int) 287 for typ, host := range dashboard.Hosts { 288 if host.ExpectNum > 0 { 289 total[typ] = 0 290 } 291 } 292 // inUse track the number of non-idle host types. 293 inUse := make(map[string]int) 294 295 var buf bytes.Buffer 296 p.mu.Lock() 297 buildlets := append([]*reverseBuildlet(nil), p.buildlets...) 
298 sort.Sort(byTypeThenHostname(buildlets)) 299 numInUse := 0 300 for _, b := range buildlets { 301 machStatus := "<i>idle</i>" 302 if b.inUse { 303 machStatus = "working" 304 numInUse++ 305 } 306 fmt.Fprintf(&buf, "<li>%s (%s) version %s, %s: connected %s, %s for %s</li>\n", 307 b.hostname, 308 b.conn.RemoteAddr(), 309 b.version, 310 b.hostType, 311 friendlyDuration(time.Since(b.regTime)), 312 machStatus, 313 friendlyDuration(time.Since(b.inUseTime))) 314 total[b.hostType]++ 315 if b.inUse && !b.inHealthCheck { 316 317 inUse[b.hostType]++ 318 } 319 } 320 numConnected := len(buildlets) 321 p.mu.Unlock() 322 323 var typs []string 324 for typ := range total { 325 typs = append(typs, typ) 326 } 327 sort.Strings(typs) 328 329 io.WriteString(w, "<b>Reverse pool stats</b><ul>\n") 330 fmt.Fprintf(w, "<li>Buildlets connected: %d</li>\n", numConnected) 331 fmt.Fprintf(w, "<li>Buildlets in use: %d</li>\n", numInUse) 332 io.WriteString(w, "</ul>") 333 334 io.WriteString(w, "<b>Reverse pool by host type</b> (in use / total)<ul>\n") 335 if len(typs) == 0 { 336 io.WriteString(w, "<li>no connections</li>\n") 337 } 338 for _, typ := range typs { 339 if dashboard.Hosts[typ] != nil && total[typ] < dashboard.Hosts[typ].ExpectNum { 340 fmt.Fprintf(w, "<li>%s: %d/%d (%d missing)</li>\n", 341 typ, inUse[typ], total[typ], dashboard.Hosts[typ].ExpectNum-total[typ]) 342 } else { 343 fmt.Fprintf(w, "<li>%s: %d/%d</li>\n", typ, inUse[typ], total[typ]) 344 } 345 } 346 io.WriteString(w, "</ul>\n") 347 348 fmt.Fprintf(w, "<b>Reverse pool machine detail</b><ul>%s</ul>", buf.Bytes()) 349 } 350 351 func (p *ReverseBuildletPool) QuotaStats() map[string]*queue.QuotaStats { 352 p.mu.Lock() 353 defer p.mu.Unlock() 354 ret := make(map[string]*queue.QuotaStats) 355 for typ, queue := range p.hostQueue { 356 ret[fmt.Sprintf("reverse-%s", typ)] = queue.ToExported() 357 } 358 return ret 359 } 360 361 // HostTypeCount iterates through the running reverse buildlets, and 362 // constructs a count of running 
buildlets per hostType. 363 func (p *ReverseBuildletPool) HostTypeCount() map[string]int { 364 total := map[string]int{} 365 p.mu.Lock() 366 for _, b := range p.buildlets { 367 total[b.hostType]++ 368 } 369 p.mu.Unlock() 370 return total 371 } 372 373 // SingleHostTypeCount iterates through the running reverse buildlets, and 374 // constructs a count of the running buildlet hostType requested. 375 func (p *ReverseBuildletPool) SingleHostTypeCount(hostType string) int { 376 p.mu.Lock() 377 defer p.mu.Unlock() 378 n := 0 379 for _, b := range p.buildlets { 380 if b.hostType == hostType { 381 n++ 382 } 383 } 384 return n 385 } 386 387 func (p *ReverseBuildletPool) String() string { 388 // This doesn't currently show up anywhere, so ignore it for now. 389 return "TODO: some reverse buildlet summary" 390 } 391 392 // HostTypes returns a sorted, deduplicated list of buildlet types 393 // currently supported by the pool. 394 func (p *ReverseBuildletPool) HostTypes() (types []string) { 395 totals := p.HostTypeCount() 396 for t := range totals { 397 types = append(types, t) 398 } 399 sort.Strings(types) 400 return types 401 } 402 403 // CanBuild reports whether the pool has a machine capable of building mode, 404 // even if said machine isn't currently idle. 
405 func (p *ReverseBuildletPool) CanBuild(hostType string) bool { 406 p.mu.Lock() 407 defer p.mu.Unlock() 408 for _, b := range p.buildlets { 409 if b.hostType == hostType { 410 return true 411 } 412 } 413 return false 414 } 415 416 func (p *ReverseBuildletPool) updateQuotas() { 417 p.mu.Lock() 418 defer p.mu.Unlock() 419 p.updateQuotasLocked() 420 } 421 422 func (p *ReverseBuildletPool) updateQuotasLocked() { 423 limits := make(map[string]int) 424 used := make(map[string]int) 425 for _, b := range p.buildlets { 426 limits[b.hostType] += 1 427 if b.inUse { 428 used[b.hostType] += 1 429 } 430 } 431 for hostType, limit := range limits { 432 q := p.hostTypeQueue(hostType) 433 q.UpdateQuotas(used[hostType], limit) 434 } 435 } 436 437 func (p *ReverseBuildletPool) addBuildlet(b *reverseBuildlet) { 438 p.mu.Lock() 439 defer p.updateQuotas() 440 defer p.mu.Unlock() 441 p.buildlets = append(p.buildlets, b) 442 p.recordHealthy(b) 443 go p.healthCheckBuildletLoop(b) 444 } 445 446 // BuildletHostnames returns a slice of reverse buildlet hostnames. 447 func (p *ReverseBuildletPool) BuildletHostnames() []string { 448 p.mu.Lock() 449 defer p.mu.Unlock() 450 451 h := make([]string, 0, len(p.buildlets)) 452 for _, b := range p.buildlets { 453 h = append(h, b.hostname) 454 } 455 return h 456 } 457 458 // reverseBuildlet is a registered reverse buildlet. 459 // Its immediate fields are guarded by the ReverseBuildletPool mutex. 460 type reverseBuildlet struct { 461 // hostname is the name of the buildlet host. 462 // It doesn't have to be a complete DNS name. 463 hostname string 464 // version is the reverse buildlet's version. 465 version string 466 467 // sessRand is the unique random number for every unique buildlet session. 468 sessRand string 469 470 client buildlet.Client 471 conn net.Conn 472 regTime time.Time // when it was first connected 473 474 // hostType is the configuration of this machine. 475 // It is the key into the dashboard.Hosts map. 
476 hostType string 477 478 // inUseAs signifies that the buildlet is in use. 479 // inUseTime is when it entered that state. 480 // inHealthCheck is whether it's inUse due to a health check. 481 // All three are guarded by the mutex on ReverseBuildletPool. 482 inUse bool 483 inUseTime time.Time 484 inHealthCheck bool 485 } 486 487 // HandleReverse handles reverse buildlet connections. 488 func HandleReverse(w http.ResponseWriter, r *http.Request) { 489 if r.TLS == nil { 490 http.Error(w, "buildlet registration requires SSL", http.StatusInternalServerError) 491 return 492 } 493 494 var ( 495 hostType = r.Header.Get("X-Go-Host-Type") 496 buildKey = r.Header.Get("X-Go-Builder-Key") 497 buildletVersion = r.Header.Get("X-Go-Builder-Version") 498 hostname = r.Header.Get("X-Go-Builder-Hostname") 499 ) 500 501 switch r.Header.Get("X-Revdial-Version") { 502 case "": 503 // Old. 504 http.Error(w, "buildlet binary is too old", http.StatusBadRequest) 505 return 506 case "2": 507 // Current. 508 default: 509 http.Error(w, "unknown revdial version", http.StatusBadRequest) 510 return 511 } 512 513 if hostname == "" { 514 http.Error(w, "missing X-Go-Builder-Hostname header", http.StatusBadRequest) 515 return 516 } 517 518 // Check build keys. 
519 if hostType == "" { 520 http.Error(w, "missing X-Go-Host-Type; old buildlet binary?", http.StatusBadRequest) 521 return 522 } 523 if buildKey != builderKey(hostType) { 524 http.Error(w, "invalid build key", http.StatusPreconditionFailed) 525 return 526 } 527 528 conn, _, err := w.(http.Hijacker).Hijack() 529 if err != nil { 530 http.Error(w, err.Error(), http.StatusInternalServerError) 531 return 532 } 533 534 if err := (&http.Response{StatusCode: http.StatusSwitchingProtocols, Proto: "HTTP/1.1"}).Write(conn); err != nil { 535 log.Printf("error writing upgrade response to reverse buildlet %s (%s) at %s: %v", hostname, hostType, r.RemoteAddr, err) 536 conn.Close() 537 return 538 } 539 540 log.Printf("Registering reverse buildlet %q (%s) for host type %v; buildletVersion=%v", 541 hostname, r.RemoteAddr, hostType, buildletVersion) 542 543 revDialer := revdial.NewDialer(conn, "/revdial") 544 revDialerDone := revDialer.Done() 545 dialer := revDialer.Dial 546 547 client := buildlet.NewClient(hostname, buildlet.NoKeyPair) 548 client.SetHTTPClient(&http.Client{ 549 Transport: &http.Transport{ 550 DialContext: func(ctx context.Context, network, addr string) (net.Conn, error) { 551 return dialer(ctx) 552 }, 553 }, 554 }) 555 client.SetDialer(dialer) 556 client.SetDescription(fmt.Sprintf("reverse peer %s/%s for host type %v", hostname, r.RemoteAddr, hostType)) 557 558 var isDead struct { 559 sync.Mutex 560 v bool 561 } 562 client.SetOnHeartbeatFailure(func() { 563 isDead.Lock() 564 isDead.v = true 565 isDead.Unlock() 566 conn.Close() 567 reversePool.nukeBuildlet(client) 568 }) 569 570 // If the reverse dialer (which is always reading from the 571 // conn) detects that the remote went away, close the buildlet 572 // client proactively show 573 go func() { 574 <-revDialerDone 575 isDead.Lock() 576 defer isDead.Unlock() 577 if !isDead.v { 578 client.Close() 579 } 580 }() 581 tstatus := time.Now() 582 status, err := client.Status(context.Background()) 583 if err != nil { 584 
log.Printf("Reverse connection %s/%s for %s did not answer status after %v: %v", 585 hostname, r.RemoteAddr, hostType, time.Since(tstatus), err) 586 conn.Close() 587 return 588 } 589 if status.Version < minBuildletVersion { 590 log.Printf("Buildlet too old (need version %d or newer): %s, %+v", minBuildletVersion, r.RemoteAddr, status) 591 conn.Close() 592 return 593 } 594 log.Printf("Buildlet %s/%s: %+v for %s", hostname, r.RemoteAddr, status, hostType) 595 596 now := time.Now() 597 b := &reverseBuildlet{ 598 hostname: hostname, 599 version: buildletVersion, 600 hostType: hostType, 601 client: client, 602 conn: conn, 603 inUseTime: now, 604 regTime: now, 605 } 606 reversePool.addBuildlet(b) 607 } 608 609 type byTypeThenHostname []*reverseBuildlet 610 611 func (s byTypeThenHostname) Len() int { return len(s) } 612 func (s byTypeThenHostname) Swap(i, j int) { s[i], s[j] = s[j], s[i] } 613 func (s byTypeThenHostname) Less(i, j int) bool { 614 bi, bj := s[i], s[j] 615 ti, tj := bi.hostType, bj.hostType 616 if ti == tj { 617 return bi.hostname < bj.hostname 618 } 619 return ti < tj 620 } 621 622 // builderKey generates the builder key used by reverse builders 623 // to authenticate with the coordinator. 624 func builderKey(builder string) string { 625 if len(builderMasterKey) == 0 { 626 return "" 627 } 628 h := hmac.New(md5.New, builderMasterKey) 629 io.WriteString(h, builder) 630 return fmt.Sprintf("%x", h.Sum(nil)) 631 }