golang.org/x/build@v0.0.0-20240506185731-218518f32b70/internal/coordinator/pool/reverse.go (about)

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build linux || darwin
     6  
     7  package pool
     8  
     9  /*
    10  This file implements reverse buildlets. These are buildlets that are not
    11  started by the coordinator. They dial the coordinator and then accept
    12  instructions. This feature is used for machines that cannot be started by
    13  an API, for example real OS X machines with iOS and Android devices attached.
    14  
    15  You can test this setup locally. In one terminal start a coordinator.
    16  It will default to dev mode, using a dummy TLS cert and not talking to GCE.
    17  
    18  	$ coordinator
    19  
    20  In another terminal, start a reverse buildlet:
    21  
    22  	$ buildlet -reverse "darwin-amd64"
    23  
    24  It will dial and register itself with the coordinator. To confirm the
    25  coordinator can see the buildlet, check the logs output or visit its
    26  diagnostics page: https://localhost:8119. To send the buildlet some
    27  work, go to:
    28  
    29  	https://localhost:8119/dosomework
    30  */
    31  
    32  import (
    33  	"bytes"
    34  	"context"
    35  	"crypto/hmac"
    36  	"crypto/md5"
    37  	"errors"
    38  	"fmt"
    39  	"io"
    40  	"log"
    41  	"math/rand"
    42  	"net"
    43  	"net/http"
    44  	"sort"
    45  	"sync"
    46  	"time"
    47  
    48  	"golang.org/x/build/buildlet"
    49  	"golang.org/x/build/dashboard"
    50  	"golang.org/x/build/internal/coordinator/pool/queue"
    51  	"golang.org/x/build/revdial/v2"
    52  )
    53  
// minBuildletVersion is the lowest buildlet status version that
// HandleReverse accepts; buildlets reporting an older version are
// disconnected instead of being added to the pool.
const minBuildletVersion = 23

var (
	// reversePool is the package-level reverse buildlet pool. It is
	// returned by ReversePool and is the pool that HandleReverse
	// registers newly connected buildlets with.
	reversePool = &ReverseBuildletPool{
		hostLastGood: make(map[string]time.Time),
		hostQueue:    make(map[string]*queue.Quota),
	}

	// builderMasterKey is the HMAC key from which builderKey derives
	// the per-host-type builder keys. It is set by SetBuilderMasterKey.
	builderMasterKey []byte
)
    64  
    65  // SetBuilderMasterKey sets the builder master key used
    66  // to generate keys used by the builders.
    67  func SetBuilderMasterKey(masterKey []byte) {
    68  	builderMasterKey = masterKey
    69  }
    70  
    71  // ReversePool retrieves the reverse buildlet pool.
    72  func ReversePool() *ReverseBuildletPool {
    73  	return reversePool
    74  }
    75  
// ReverseBuildletPool manages the pool of reverse buildlets: buildlets
// that dial in to the coordinator rather than being started by it.
type ReverseBuildletPool struct {
	// mu guards all of the fields below and also the mutable
	// fields of each *reverseBuildlet in buildlets.
	mu sync.Mutex

	// buildlets are the currently connected buildlets.
	// TODO: switch to a map[hostType][]buildlets or map of set.
	buildlets []*reverseBuildlet

	// hostQueue maps a host type to the quota used to queue
	// GetBuildlet callers for that host type. Entries are created
	// lazily by hostTypeQueue.
	hostQueue map[string]*queue.Quota

	// hostLastGood tracks when buildlets were last seen to be
	// healthy. It's only used by the health reporting code (in
	// status.go). The reason it's a map on ReverseBuildletPool
	// rather than a field on each reverseBuildlet is because we
	// also want to track the last known health time of buildlets
	// that aren't currently connected.
	//
	// Each buildlet's health is recorded in the map twice, under
	// two different keys: 1) its reported host name, and 2) its
	// hostType + ":" + its reported host name. It's recorded both
	// ways so the status code can check for both globally-unique
	// hostnames that change host types (e.g. our Macs), as well
	// as hostnames that aren't globally unique and are expected
	// to be found with different hostTypes (e.g. our ppc64le
	// machines as both POWER8 and POWER9 host types, but with the
	// same names).
	hostLastGood map[string]time.Time
}
   106  
   107  // BuildletLastSeen gives the last time a buildlet was connected to the pool. If
   108  // the buildlet has not been seen a false is returned by the boolean.
   109  func (p *ReverseBuildletPool) BuildletLastSeen(host string) (time.Time, bool) {
   110  	p.mu.Lock()
   111  	defer p.mu.Unlock()
   112  
   113  	t, ok := p.hostLastGood[host]
   114  	return t, ok
   115  }
   116  
   117  // tryToGrab returns non-nil bc on success if a buildlet is free.
   118  //
   119  // Otherwise it returns how many were busy, which might be 0 if none
   120  // were (yet?) registered. The busy valid is only valid if bc == nil.
   121  func (p *ReverseBuildletPool) tryToGrab(hostType string) (bc buildlet.Client, busy int) {
   122  	p.mu.Lock()
   123  	defer p.mu.Unlock()
   124  	defer p.updateQuotasLocked()
   125  	for _, b := range p.buildlets {
   126  		if b.hostType != hostType {
   127  			continue
   128  		}
   129  		if b.inUse {
   130  			busy++
   131  			continue
   132  		}
   133  		// Found an unused match.
   134  		b.inUse = true
   135  		b.inUseTime = time.Now()
   136  		return b.client, 0
   137  	}
   138  	return nil, busy
   139  }
   140  
   141  // nukeBuildlet wipes out victim as a buildlet we'll ever return again,
   142  // and closes its TCP connection in hopes that it will fix itself
   143  // later.
   144  func (p *ReverseBuildletPool) nukeBuildlet(victim buildlet.Client) {
   145  	p.mu.Lock()
   146  	defer p.mu.Unlock()
   147  	defer p.updateQuotasLocked()
   148  	for i, rb := range p.buildlets {
   149  		if rb.client == victim {
   150  			defer rb.conn.Close()
   151  			p.buildlets = append(p.buildlets[:i], p.buildlets[i+1:]...)
   152  			return
   153  		}
   154  	}
   155  }
   156  
   157  // healthCheckBuildletLoop periodically requests the status from b.
   158  // If the buildlet fails to respond promptly, it is removed from the pool.
   159  func (p *ReverseBuildletPool) healthCheckBuildletLoop(b *reverseBuildlet) {
   160  	for {
   161  		time.Sleep(time.Duration(10+rand.Intn(5)) * time.Second)
   162  		if !p.healthCheckBuildlet(b) {
   163  			return
   164  		}
   165  	}
   166  }
   167  
   168  // recordHealthy updates the two map entries in hostLastGood recording
   169  // that b is healthy.
   170  func (p *ReverseBuildletPool) recordHealthy(b *reverseBuildlet) {
   171  	t := time.Now()
   172  	p.hostLastGood[b.hostname] = t
   173  	p.hostLastGood[b.hostType+":"+b.hostname] = t
   174  }
   175  
   176  func (p *ReverseBuildletPool) healthCheckBuildlet(b *reverseBuildlet) bool {
   177  	defer p.updateQuotas()
   178  	if b.client.IsBroken() {
   179  		return false
   180  	}
   181  	p.mu.Lock()
   182  	if b.inHealthCheck { // sanity check
   183  		panic("previous health check still running")
   184  	}
   185  	if b.inUse {
   186  		p.recordHealthy(b)
   187  		p.mu.Unlock()
   188  		return true // skip busy buildlets
   189  	}
   190  	b.inUse = true
   191  	b.inHealthCheck = true
   192  	b.inUseTime = time.Now()
   193  	res := make(chan error, 1)
   194  	go func() {
   195  		_, err := b.client.Status(context.Background())
   196  		res <- err
   197  	}()
   198  	p.mu.Unlock()
   199  
   200  	t := time.NewTimer(20 * time.Second) // give buildlets time to respond
   201  	var err error
   202  	select {
   203  	case err = <-res:
   204  		t.Stop()
   205  	case <-t.C:
   206  		err = errors.New("health check timeout")
   207  	}
   208  
   209  	if err != nil {
   210  		// remove bad buildlet
   211  		log.Printf("Health check fail; removing reverse buildlet %v (type %v): %v", b.hostname, b.hostType, err)
   212  		go b.client.Close()
   213  		go p.nukeBuildlet(b.client)
   214  		return false
   215  	}
   216  
   217  	p.mu.Lock()
   218  	defer p.mu.Unlock()
   219  
   220  	if !b.inHealthCheck {
   221  		// buildlet was grabbed while lock was released; harmless.
   222  		return true
   223  	}
   224  	b.inUse = false
   225  	b.inHealthCheck = false
   226  	b.inUseTime = time.Now()
   227  	p.recordHealthy(b)
   228  	return true
   229  }
   230  
   231  func (p *ReverseBuildletPool) hostTypeQueue(hostType string) *queue.Quota {
   232  	if p.hostQueue[hostType] == nil {
   233  		queue := queue.NewQuota()
   234  		p.hostQueue[hostType] = queue
   235  	}
   236  	return p.hostQueue[hostType]
   237  }
   238  
   239  // GetBuildlet builds a buildlet client for the passed in host.
   240  func (p *ReverseBuildletPool) GetBuildlet(ctx context.Context, hostType string, lg Logger, si *queue.SchedItem) (buildlet.Client, error) {
   241  	sp := lg.CreateSpan("wait_static_builder", hostType)
   242  	// No need to return quota when done. The quotas will be updated
   243  	// when the reverse buildlet reconnects and becomes healthy.
   244  	err := p.hostTypeQueue(hostType).AwaitQueue(ctx, 1, si)
   245  	sp.Done(err)
   246  	if err != nil {
   247  		return nil, err
   248  	}
   249  
   250  	seenErrInUse := false
   251  	for {
   252  		bc, busy := p.tryToGrab(hostType)
   253  		if bc != nil {
   254  			sp.Done(nil)
   255  			return p.cleanedBuildlet(bc, lg)
   256  		}
   257  		if busy > 0 && !seenErrInUse {
   258  			lg.LogEventTime("waiting_machine_in_use")
   259  			seenErrInUse = true
   260  		}
   261  		select {
   262  		case <-ctx.Done():
   263  			return nil, sp.Done(ctx.Err())
   264  		case <-time.After(10 * time.Second):
   265  		}
   266  	}
   267  }
   268  
   269  func (p *ReverseBuildletPool) cleanedBuildlet(b buildlet.Client, lg Logger) (buildlet.Client, error) {
   270  	// Clean up any files from previous builds.
   271  	sp := lg.CreateSpan("clean_buildlet", b.String())
   272  	err := b.RemoveAll(context.Background(), ".")
   273  	sp.Done(err)
   274  	if err != nil {
   275  		b.Close()
   276  		return nil, err
   277  	}
   278  	return b, nil
   279  }
   280  
   281  // WriteHTMLStatus writes a status of the reverse buildlet pool, in HTML format,
   282  // to the passed in io.Writer.
   283  func (p *ReverseBuildletPool) WriteHTMLStatus(w io.Writer) {
   284  	// total maps from a host type to the number of machines which are
   285  	// capable of that role.
   286  	total := make(map[string]int)
   287  	for typ, host := range dashboard.Hosts {
   288  		if host.ExpectNum > 0 {
   289  			total[typ] = 0
   290  		}
   291  	}
   292  	// inUse track the number of non-idle host types.
   293  	inUse := make(map[string]int)
   294  
   295  	var buf bytes.Buffer
   296  	p.mu.Lock()
   297  	buildlets := append([]*reverseBuildlet(nil), p.buildlets...)
   298  	sort.Sort(byTypeThenHostname(buildlets))
   299  	numInUse := 0
   300  	for _, b := range buildlets {
   301  		machStatus := "<i>idle</i>"
   302  		if b.inUse {
   303  			machStatus = "working"
   304  			numInUse++
   305  		}
   306  		fmt.Fprintf(&buf, "<li>%s (%s) version %s, %s: connected %s, %s for %s</li>\n",
   307  			b.hostname,
   308  			b.conn.RemoteAddr(),
   309  			b.version,
   310  			b.hostType,
   311  			friendlyDuration(time.Since(b.regTime)),
   312  			machStatus,
   313  			friendlyDuration(time.Since(b.inUseTime)))
   314  		total[b.hostType]++
   315  		if b.inUse && !b.inHealthCheck {
   316  
   317  			inUse[b.hostType]++
   318  		}
   319  	}
   320  	numConnected := len(buildlets)
   321  	p.mu.Unlock()
   322  
   323  	var typs []string
   324  	for typ := range total {
   325  		typs = append(typs, typ)
   326  	}
   327  	sort.Strings(typs)
   328  
   329  	io.WriteString(w, "<b>Reverse pool stats</b><ul>\n")
   330  	fmt.Fprintf(w, "<li>Buildlets connected: %d</li>\n", numConnected)
   331  	fmt.Fprintf(w, "<li>Buildlets in use: %d</li>\n", numInUse)
   332  	io.WriteString(w, "</ul>")
   333  
   334  	io.WriteString(w, "<b>Reverse pool by host type</b> (in use / total)<ul>\n")
   335  	if len(typs) == 0 {
   336  		io.WriteString(w, "<li>no connections</li>\n")
   337  	}
   338  	for _, typ := range typs {
   339  		if dashboard.Hosts[typ] != nil && total[typ] < dashboard.Hosts[typ].ExpectNum {
   340  			fmt.Fprintf(w, "<li>%s: %d/%d (%d missing)</li>\n",
   341  				typ, inUse[typ], total[typ], dashboard.Hosts[typ].ExpectNum-total[typ])
   342  		} else {
   343  			fmt.Fprintf(w, "<li>%s: %d/%d</li>\n", typ, inUse[typ], total[typ])
   344  		}
   345  	}
   346  	io.WriteString(w, "</ul>\n")
   347  
   348  	fmt.Fprintf(w, "<b>Reverse pool machine detail</b><ul>%s</ul>", buf.Bytes())
   349  }
   350  
   351  func (p *ReverseBuildletPool) QuotaStats() map[string]*queue.QuotaStats {
   352  	p.mu.Lock()
   353  	defer p.mu.Unlock()
   354  	ret := make(map[string]*queue.QuotaStats)
   355  	for typ, queue := range p.hostQueue {
   356  		ret[fmt.Sprintf("reverse-%s", typ)] = queue.ToExported()
   357  	}
   358  	return ret
   359  }
   360  
   361  // HostTypeCount iterates through the running reverse buildlets, and
   362  // constructs a count of running buildlets per hostType.
   363  func (p *ReverseBuildletPool) HostTypeCount() map[string]int {
   364  	total := map[string]int{}
   365  	p.mu.Lock()
   366  	for _, b := range p.buildlets {
   367  		total[b.hostType]++
   368  	}
   369  	p.mu.Unlock()
   370  	return total
   371  }
   372  
   373  // SingleHostTypeCount iterates through the running reverse buildlets, and
   374  // constructs a count of the running buildlet hostType requested.
   375  func (p *ReverseBuildletPool) SingleHostTypeCount(hostType string) int {
   376  	p.mu.Lock()
   377  	defer p.mu.Unlock()
   378  	n := 0
   379  	for _, b := range p.buildlets {
   380  		if b.hostType == hostType {
   381  			n++
   382  		}
   383  	}
   384  	return n
   385  }
   386  
   387  func (p *ReverseBuildletPool) String() string {
   388  	// This doesn't currently show up anywhere, so ignore it for now.
   389  	return "TODO: some reverse buildlet summary"
   390  }
   391  
   392  // HostTypes returns a sorted, deduplicated list of buildlet types
   393  // currently supported by the pool.
   394  func (p *ReverseBuildletPool) HostTypes() (types []string) {
   395  	totals := p.HostTypeCount()
   396  	for t := range totals {
   397  		types = append(types, t)
   398  	}
   399  	sort.Strings(types)
   400  	return types
   401  }
   402  
   403  // CanBuild reports whether the pool has a machine capable of building mode,
   404  // even if said machine isn't currently idle.
   405  func (p *ReverseBuildletPool) CanBuild(hostType string) bool {
   406  	p.mu.Lock()
   407  	defer p.mu.Unlock()
   408  	for _, b := range p.buildlets {
   409  		if b.hostType == hostType {
   410  			return true
   411  		}
   412  	}
   413  	return false
   414  }
   415  
   416  func (p *ReverseBuildletPool) updateQuotas() {
   417  	p.mu.Lock()
   418  	defer p.mu.Unlock()
   419  	p.updateQuotasLocked()
   420  }
   421  
   422  func (p *ReverseBuildletPool) updateQuotasLocked() {
   423  	limits := make(map[string]int)
   424  	used := make(map[string]int)
   425  	for _, b := range p.buildlets {
   426  		limits[b.hostType] += 1
   427  		if b.inUse {
   428  			used[b.hostType] += 1
   429  		}
   430  	}
   431  	for hostType, limit := range limits {
   432  		q := p.hostTypeQueue(hostType)
   433  		q.UpdateQuotas(used[hostType], limit)
   434  	}
   435  }
   436  
   437  func (p *ReverseBuildletPool) addBuildlet(b *reverseBuildlet) {
   438  	p.mu.Lock()
   439  	defer p.updateQuotas()
   440  	defer p.mu.Unlock()
   441  	p.buildlets = append(p.buildlets, b)
   442  	p.recordHealthy(b)
   443  	go p.healthCheckBuildletLoop(b)
   444  }
   445  
   446  // BuildletHostnames returns a slice of reverse buildlet hostnames.
   447  func (p *ReverseBuildletPool) BuildletHostnames() []string {
   448  	p.mu.Lock()
   449  	defer p.mu.Unlock()
   450  
   451  	h := make([]string, 0, len(p.buildlets))
   452  	for _, b := range p.buildlets {
   453  		h = append(h, b.hostname)
   454  	}
   455  	return h
   456  }
   457  
// reverseBuildlet is a registered reverse buildlet.
// Its immediate fields are guarded by the ReverseBuildletPool mutex.
type reverseBuildlet struct {
	// hostname is the name of the buildlet host.
	// It doesn't have to be a complete DNS name.
	hostname string
	// version is the reverse buildlet's version.
	version string

	// sessRand is the unique random number for every unique buildlet session.
	// NOTE(review): HandleReverse in this file does not set it.
	sessRand string

	// client is the buildlet client handed out to users of the pool.
	client buildlet.Client
	// conn is the underlying connection the buildlet registered on;
	// it is closed when the buildlet is nuked.
	conn    net.Conn
	regTime time.Time // when it was first connected

	// hostType is the configuration of this machine.
	// It is the key into the dashboard.Hosts map.
	hostType string

	// inUse signifies that the buildlet is in use.
	// inUseTime is when it last entered or left that state.
	// inHealthCheck is whether it's inUse due to a health check.
	// All three are guarded by the mutex on ReverseBuildletPool.
	inUse         bool
	inUseTime     time.Time
	inHealthCheck bool
}
   486  
   487  // HandleReverse handles reverse buildlet connections.
   488  func HandleReverse(w http.ResponseWriter, r *http.Request) {
   489  	if r.TLS == nil {
   490  		http.Error(w, "buildlet registration requires SSL", http.StatusInternalServerError)
   491  		return
   492  	}
   493  
   494  	var (
   495  		hostType        = r.Header.Get("X-Go-Host-Type")
   496  		buildKey        = r.Header.Get("X-Go-Builder-Key")
   497  		buildletVersion = r.Header.Get("X-Go-Builder-Version")
   498  		hostname        = r.Header.Get("X-Go-Builder-Hostname")
   499  	)
   500  
   501  	switch r.Header.Get("X-Revdial-Version") {
   502  	case "":
   503  		// Old.
   504  		http.Error(w, "buildlet binary is too old", http.StatusBadRequest)
   505  		return
   506  	case "2":
   507  		// Current.
   508  	default:
   509  		http.Error(w, "unknown revdial version", http.StatusBadRequest)
   510  		return
   511  	}
   512  
   513  	if hostname == "" {
   514  		http.Error(w, "missing X-Go-Builder-Hostname header", http.StatusBadRequest)
   515  		return
   516  	}
   517  
   518  	// Check build keys.
   519  	if hostType == "" {
   520  		http.Error(w, "missing X-Go-Host-Type; old buildlet binary?", http.StatusBadRequest)
   521  		return
   522  	}
   523  	if buildKey != builderKey(hostType) {
   524  		http.Error(w, "invalid build key", http.StatusPreconditionFailed)
   525  		return
   526  	}
   527  
   528  	conn, _, err := w.(http.Hijacker).Hijack()
   529  	if err != nil {
   530  		http.Error(w, err.Error(), http.StatusInternalServerError)
   531  		return
   532  	}
   533  
   534  	if err := (&http.Response{StatusCode: http.StatusSwitchingProtocols, Proto: "HTTP/1.1"}).Write(conn); err != nil {
   535  		log.Printf("error writing upgrade response to reverse buildlet %s (%s) at %s: %v", hostname, hostType, r.RemoteAddr, err)
   536  		conn.Close()
   537  		return
   538  	}
   539  
   540  	log.Printf("Registering reverse buildlet %q (%s) for host type %v; buildletVersion=%v",
   541  		hostname, r.RemoteAddr, hostType, buildletVersion)
   542  
   543  	revDialer := revdial.NewDialer(conn, "/revdial")
   544  	revDialerDone := revDialer.Done()
   545  	dialer := revDialer.Dial
   546  
   547  	client := buildlet.NewClient(hostname, buildlet.NoKeyPair)
   548  	client.SetHTTPClient(&http.Client{
   549  		Transport: &http.Transport{
   550  			DialContext: func(ctx context.Context, network, addr string) (net.Conn, error) {
   551  				return dialer(ctx)
   552  			},
   553  		},
   554  	})
   555  	client.SetDialer(dialer)
   556  	client.SetDescription(fmt.Sprintf("reverse peer %s/%s for host type %v", hostname, r.RemoteAddr, hostType))
   557  
   558  	var isDead struct {
   559  		sync.Mutex
   560  		v bool
   561  	}
   562  	client.SetOnHeartbeatFailure(func() {
   563  		isDead.Lock()
   564  		isDead.v = true
   565  		isDead.Unlock()
   566  		conn.Close()
   567  		reversePool.nukeBuildlet(client)
   568  	})
   569  
   570  	// If the reverse dialer (which is always reading from the
   571  	// conn) detects that the remote went away, close the buildlet
   572  	// client proactively show
   573  	go func() {
   574  		<-revDialerDone
   575  		isDead.Lock()
   576  		defer isDead.Unlock()
   577  		if !isDead.v {
   578  			client.Close()
   579  		}
   580  	}()
   581  	tstatus := time.Now()
   582  	status, err := client.Status(context.Background())
   583  	if err != nil {
   584  		log.Printf("Reverse connection %s/%s for %s did not answer status after %v: %v",
   585  			hostname, r.RemoteAddr, hostType, time.Since(tstatus), err)
   586  		conn.Close()
   587  		return
   588  	}
   589  	if status.Version < minBuildletVersion {
   590  		log.Printf("Buildlet too old (need version %d or newer): %s, %+v", minBuildletVersion, r.RemoteAddr, status)
   591  		conn.Close()
   592  		return
   593  	}
   594  	log.Printf("Buildlet %s/%s: %+v for %s", hostname, r.RemoteAddr, status, hostType)
   595  
   596  	now := time.Now()
   597  	b := &reverseBuildlet{
   598  		hostname:  hostname,
   599  		version:   buildletVersion,
   600  		hostType:  hostType,
   601  		client:    client,
   602  		conn:      conn,
   603  		inUseTime: now,
   604  		regTime:   now,
   605  	}
   606  	reversePool.addBuildlet(b)
   607  }
   608  
   609  type byTypeThenHostname []*reverseBuildlet
   610  
   611  func (s byTypeThenHostname) Len() int      { return len(s) }
   612  func (s byTypeThenHostname) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
   613  func (s byTypeThenHostname) Less(i, j int) bool {
   614  	bi, bj := s[i], s[j]
   615  	ti, tj := bi.hostType, bj.hostType
   616  	if ti == tj {
   617  		return bi.hostname < bj.hostname
   618  	}
   619  	return ti < tj
   620  }
   621  
   622  // builderKey generates the builder key used by reverse builders
   623  // to authenticate with the coordinator.
   624  func builderKey(builder string) string {
   625  	if len(builderMasterKey) == 0 {
   626  		return ""
   627  	}
   628  	h := hmac.New(md5.New, builderMasterKey)
   629  	io.WriteString(h, builder)
   630  	return fmt.Sprintf("%x", h.Sum(nil))
   631  }