golang.org/x/build@v0.0.0-20240506185731-218518f32b70/internal/coordinator/pool/gce.go (about)

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build linux || darwin
     6  
     7  // Code interacting with Google Compute Engine (GCE) and
     8  // a GCE implementation of the BuildletPool interface.
     9  
    10  package pool
    11  
    12  import (
    13  	"context"
    14  	"encoding/json"
    15  	"errors"
    16  	"fmt"
    17  	"io"
    18  	"log"
    19  	"net/http"
    20  	"path"
    21  	"sort"
    22  	"strconv"
    23  	"strings"
    24  	"sync"
    25  	"sync/atomic"
    26  	"time"
    27  
    28  	"cloud.google.com/go/compute/metadata"
    29  	"cloud.google.com/go/datastore"
    30  	"cloud.google.com/go/errorreporting"
    31  	"cloud.google.com/go/storage"
    32  	"golang.org/x/build/buildenv"
    33  	"golang.org/x/build/buildlet"
    34  	"golang.org/x/build/dashboard"
    35  	"golang.org/x/build/gerrit"
    36  	"golang.org/x/build/internal/buildgo"
    37  	"golang.org/x/build/internal/buildstats"
    38  	"golang.org/x/build/internal/coordinator/pool/queue"
    39  	"golang.org/x/build/internal/lru"
    40  	"golang.org/x/build/internal/secret"
    41  	"golang.org/x/build/internal/spanlog"
    42  	"golang.org/x/oauth2"
    43  	"golang.org/x/oauth2/google"
    44  	"google.golang.org/api/compute/v1"
    45  	"google.golang.org/api/googleapi"
    46  )
    47  
// init assigns gceAPIGate to buildlet.GCEGate.
// NOTE(review): presumably this makes the buildlet package wait on
// apiCallTicker before its own GCE API calls — confirm against buildlet.
func init() {
	buildlet.GCEGate = gceAPIGate
}
    51  
// apiCallTicker ticks regularly, preventing us from accidentally making
// GCE API calls too quickly. Our quota is 20 QPS, but we temporarily
// limit ourselves to less than that: the ticker fires every tenth of a
// second, so callers of gceAPIGate proceed at most about 10 times per
// second.
var apiCallTicker = time.NewTicker(time.Second / 10)

// Separate rate limit for deletions, which are more important than other
// actions, especially at server startup. It ticks at the same interval as
// apiCallTicker but is consumed only through deletionAPIGate.
var deletionTicker = time.NewTicker(time.Second / 10)
    60  
// gceAPIGate blocks until the next tick of apiCallTicker. Call it
// immediately before issuing a (non-deletion) GCE API request.
func gceAPIGate() {
	<-apiCallTicker.C
}
    64  
// deletionAPIGate blocks until the next tick of deletionTicker. It is the
// gate used by the VM deletion and clean-up paths.
func deletionAPIGate() {
	<-deletionTicker.C
}
    68  
// Initialized by InitGCE:
//
// TODO(golang.org/issue/38337): These should be moved into a struct as
// part of the effort to reduce package level variables.
var (
	// buildEnv is the environment selected by buildenv.ByProjectID in InitGCE
	// (or replaced through GCEConfiguration.SetBuildEnv).
	buildEnv *buildenv.Environment

	// dsClient is a datastore client for the build project (symbolic-datum-552), where build progress is stored.
	dsClient *datastore.Client
	// goDSClient is a datastore client for golang-org, where build status is stored.
	goDSClient *datastore.Client
	// oAuthHTTPClient is the OAuth2 HTTP client used to make API calls to Google Cloud APIs.
	//
	// The undocumented variables that follow are:
	//   - computeService: Compute API client created from oAuthHTTPClient.
	//   - gcpCreds: credentials returned by buildEnv.Credentials.
	//   - gerritClient: Gerrit client assigned by checkTryBuildDeps.
	//   - storageClient: Cloud Storage client, created only when not in dev mode.
	//   - gkeNodeHostname: "instance/hostname" metadata value, read when on GCE.
	oAuthHTTPClient *http.Client
	computeService  *compute.Service
	gcpCreds        *google.Credentials
	errTryDeps      error // non-nil if try bots are disabled
	gerritClient    *gerrit.Client
	storageClient   *storage.Client
	inStaging       bool                   // are we running in the staging project? (named -dev)
	errorsClient    *errorreporting.Client // Stackdriver errors client
	gkeNodeHostname string

	// values created due to separating the buildlet pools into a separate package
	//
	// gceMode is the mode argument given to InitGCE ("dev" disables most GCE
	// interaction). basePinErr receives the basepin disk creation status as a
	// string (empty on success). isRemoteBuildlet is the callback given to
	// InitGCE and consulted by cleanZoneVMs.
	gceMode          string
	basePinErr       *atomic.Value
	isRemoteBuildlet IsRemoteBuildletFunc
)
    96  
    97  // InitGCE initializes the GCE buildlet pool.
    98  func InitGCE(sc *secret.Client, basePin *atomic.Value, fn IsRemoteBuildletFunc, buildEnvName, mode string) error {
    99  	gceMode = mode
   100  	basePinErr = basePin
   101  	isRemoteBuildlet = fn
   102  
   103  	ctx := context.Background()
   104  	var err error
   105  
   106  	// If the coordinator is running on a GCE instance and a
   107  	// buildEnv was not specified with the env flag, set the
   108  	// buildEnvName to the project ID
   109  	if buildEnvName == "" {
   110  		if mode == "dev" {
   111  			buildEnvName = "dev"
   112  		} else if metadata.OnGCE() {
   113  			buildEnvName, err = metadata.ProjectID()
   114  			if err != nil {
   115  				log.Fatalf("metadata.ProjectID: %v", err)
   116  			}
   117  		}
   118  	}
   119  
   120  	buildEnv = buildenv.ByProjectID(buildEnvName)
   121  	inStaging = buildEnv == buildenv.Staging
   122  
   123  	// If running on GCE, override the zone and static IP, and check service account permissions.
   124  	if metadata.OnGCE() {
   125  		gkeNodeHostname, err = metadata.Get("instance/hostname")
   126  		if err != nil {
   127  			return fmt.Errorf("failed to get current instance hostname: %v", err)
   128  		}
   129  
   130  		if len(buildEnv.VMZones) == 0 || buildEnv.VMRegion == "" {
   131  			projectZone, err := metadata.Get("instance/zone")
   132  			if err != nil || projectZone == "" {
   133  				return fmt.Errorf("failed to get current GCE zone: %v", err)
   134  			}
   135  			// Convert the zone from "projects/1234/zones/us-central1-a" to "us-central1-a".
   136  			projectZone = path.Base(projectZone)
   137  			if len(buildEnv.VMZones) == 0 {
   138  				buildEnv.VMZones = []string{projectZone}
   139  			}
   140  			if buildEnv.VMRegion == "" {
   141  				buildEnv.VMRegion = strings.Join(strings.Split(projectZone, "-")[:2], "-")
   142  			}
   143  		}
   144  
   145  		if buildEnv.StaticIP == "" {
   146  			buildEnv.StaticIP, err = metadata.ExternalIP()
   147  			if err != nil {
   148  				return fmt.Errorf("ExternalIP: %v", err)
   149  			}
   150  		}
   151  
   152  		if !hasComputeScope() {
   153  			return errors.New("coordinator is not running with access to read and write Compute resources. VM support disabled")
   154  		}
   155  	}
   156  
   157  	cfgDump, _ := json.MarshalIndent(buildEnv, "", "  ")
   158  	log.Printf("Loaded configuration %q for project %q:\n%s", buildEnvName, buildEnv.ProjectName, cfgDump)
   159  
   160  	if mode != "dev" {
   161  		storageClient, err = storage.NewClient(ctx)
   162  		if err != nil {
   163  			log.Fatalf("storage.NewClient: %v", err)
   164  		}
   165  	}
   166  
   167  	dsClient, err = datastore.NewClient(ctx, buildEnv.ProjectName)
   168  	if err != nil {
   169  		if mode == "dev" {
   170  			log.Printf("Error creating datastore client for %q: %v", buildEnv.ProjectName, err)
   171  		} else {
   172  			log.Fatalf("Error creating datastore client for %q: %v", buildEnv.ProjectName, err)
   173  		}
   174  	}
   175  	goDSClient, err = datastore.NewClient(ctx, buildEnv.GoProjectName)
   176  	if err != nil {
   177  		if mode == "dev" {
   178  			log.Printf("Error creating datastore client for %q: %v", buildEnv.GoProjectName, err)
   179  		} else {
   180  			log.Fatalf("Error creating datastore client for %q: %v", buildEnv.GoProjectName, err)
   181  		}
   182  	}
   183  
   184  	// don't send dev errors to Stackdriver.
   185  	if mode != "dev" {
   186  		errorsClient, err = errorreporting.NewClient(ctx, buildEnv.ProjectName, errorreporting.Config{
   187  			ServiceName: "coordinator",
   188  		})
   189  		if err != nil {
   190  			// don't exit, we still want to run coordinator
   191  			log.Printf("Error creating errors client: %v", err)
   192  		}
   193  	}
   194  
   195  	gcpCreds, err = buildEnv.Credentials(ctx)
   196  	if err != nil {
   197  		if mode == "dev" {
   198  			// don't try to do anything else with GCE, as it will likely fail
   199  			return nil
   200  		}
   201  		log.Fatalf("failed to get a token source: %v", err)
   202  	}
   203  	oAuthHTTPClient = oauth2.NewClient(ctx, gcpCreds.TokenSource)
   204  	computeService, _ = compute.New(oAuthHTTPClient)
   205  	errTryDeps = checkTryBuildDeps(ctx, sc)
   206  	if errTryDeps != nil {
   207  		log.Printf("TryBot builders disabled due to error: %v", errTryDeps)
   208  	} else {
   209  		log.Printf("TryBot builders enabled.")
   210  	}
   211  
   212  	if mode != "dev" && metadata.OnGCE() && (buildEnv == buildenv.Production || buildEnv == buildenv.Staging) {
   213  		go syncBuildStatsLoop(buildEnv)
   214  		go gcePool.pollQuotaLoop()
   215  		go createBasepinDisks(ctx)
   216  	}
   217  
   218  	return nil
   219  }
   220  
   221  // StorageClient retrieves the GCE storage client.
   222  func StorageClient(ctx context.Context) (*storage.Client, error) {
   223  	sc, err := storage.NewClient(ctx)
   224  	if err != nil {
   225  		return nil, fmt.Errorf("storage.NewClient: %w", err)
   226  	}
   227  	return sc, nil
   228  }
   229  
   230  // TODO(golang.org/issue/38337): These should be moved into a struct as
   231  // part of the effort to reduce package level variables.
   232  
// GCEConfiguration manages and contains all of the GCE configuration.
//
// The type itself is empty: every accessor below reads (or, for
// SetBuildEnv, writes) the package-level variables populated by InitGCE,
// so all GCEConfiguration values observe the same state.
type GCEConfiguration struct{}

// NewGCEConfiguration creates a new GCEConfiguration.
func NewGCEConfiguration() *GCEConfiguration { return &GCEConfiguration{} }

// StorageClient retrieves the GCE storage client. It is nil until InitGCE
// has created it, which happens only when not running in dev mode.
func (c *GCEConfiguration) StorageClient() *storage.Client {
	return storageClient
}

// BuildEnv retrieves the GCE build env.
func (c *GCEConfiguration) BuildEnv() *buildenv.Environment {
	return buildEnv
}

// SetBuildEnv sets the GCE build env. This is primarily reserved for
// testing purposes. It replaces the package-level environment without any
// synchronization.
func (c *GCEConfiguration) SetBuildEnv(b *buildenv.Environment) {
	buildEnv = b
}

// BuildletPool retrieves the GCE buildlet pool (the package-level gcePool).
func (c *GCEConfiguration) BuildletPool() *GCEBuildlet {
	return gcePool
}

// InStaging returns a boolean denoting if the environment is staging.
func (c *GCEConfiguration) InStaging() bool {
	return inStaging
}

// GerritClient retrieves a gerrit client. The client is assigned by
// checkTryBuildDeps and is nil if that check failed before reaching the
// Gerrit setup.
func (c *GCEConfiguration) GerritClient() *gerrit.Client {
	return gerritClient
}

// GKENodeHostname retrieves the GKE node hostname, as read from the
// "instance/hostname" metadata key when running on GCE.
func (c *GCEConfiguration) GKENodeHostname() string {
	return gkeNodeHostname
}

// DSClient retrieves the datastore client.
func (c *GCEConfiguration) DSClient() *datastore.Client {
	return dsClient
}

// GoDSClient retrieves the datastore client for golang.org project.
func (c *GCEConfiguration) GoDSClient() *datastore.Client {
	return goDSClient
}

// TryDepsErr retrieves any Trybot dependency error. A non-nil result means
// try bots are disabled.
func (c *GCEConfiguration) TryDepsErr() error {
	return errTryDeps
}

// ErrorsClient retrieves the stackdriver errors client. It is not created
// in dev mode.
func (c *GCEConfiguration) ErrorsClient() *errorreporting.Client {
	return errorsClient
}

// OAuthHTTPClient retrieves an OAuth2 HTTP client used to make API calls to GCP.
func (c *GCEConfiguration) OAuthHTTPClient() *http.Client {
	return oAuthHTTPClient
}

// GCPCredentials retrieves the GCP credentials.
func (c *GCEConfiguration) GCPCredentials() *google.Credentials {
	return gcpCreds
}
   304  
   305  func checkTryBuildDeps(ctx context.Context, sc *secret.Client) error {
   306  	if !hasStorageScope() {
   307  		return errors.New("coordinator's GCE instance lacks the storage service scope")
   308  	}
   309  	if gceMode == "dev" {
   310  		return errors.New("running in dev mode")
   311  	}
   312  	wr := storageClient.Bucket(buildEnv.LogBucket).Object("hello.txt").NewWriter(context.Background())
   313  	fmt.Fprintf(wr, "Hello, world! Coordinator start-up at %v", time.Now())
   314  	if err := wr.Close(); err != nil {
   315  		return fmt.Errorf("test write of a GCS object to bucket %q failed: %v", buildEnv.LogBucket, err)
   316  	}
   317  	if inStaging {
   318  		// Don't expect to write to Gerrit in staging mode.
   319  		gerritClient = gerrit.NewClient("https://go-review.googlesource.com", gerrit.NoAuth)
   320  	} else {
   321  		ctxSec, cancel := context.WithTimeout(ctx, 10*time.Second)
   322  		defer cancel()
   323  
   324  		gobotPass, err := sc.Retrieve(ctxSec, secret.NameGobotPassword)
   325  		if err != nil {
   326  			return fmt.Errorf("failed to get project metadata 'gobot-password': %v", err)
   327  		}
   328  		gerritClient = gerrit.NewClient("https://go-review.googlesource.com",
   329  			gerrit.BasicAuth("git-gobot.golang.org", strings.TrimSpace(string(gobotPass))))
   330  	}
   331  
   332  	return nil
   333  }
   334  
// gcePool is the package's single GCE buildlet pool. Each quota queue is
// created up front; their limits are filled in later by pollQuota.
var gcePool = &GCEBuildlet{
	c2cpuQueue:  queue.NewQuota(),
	cpuQueue:    queue.NewQuota(),
	instQueue:   queue.NewQuota(),
	n2cpuQueue:  queue.NewQuota(),
	n2dcpuQueue: queue.NewQuota(),
	t2acpuQueue: queue.NewQuota(),
}

// Compile-time check that *GCEBuildlet implements Buildlet.
var _ Buildlet = (*GCEBuildlet)(nil)
   345  
// GCEBuildlet manages a pool of GCE buildlets.
type GCEBuildlet struct {
	mu sync.Mutex // guards all following

	// disabled is the inverse of the value last passed to SetEnabled;
	// GetBuildlet refuses to create VMs while it is true.
	disabled bool

	// CPU quota usage & limits. pollQuota updates quotas periodically.
	// The values recorded here reflect the updates as well as our own
	// bookkeeping of instances as they are created and destroyed.
	//
	// queueForMachineType chooses which CPU queue a machine type is charged
	// to; instQueue counts instances regardless of machine type.
	c2cpuQueue  *queue.Quota
	cpuQueue    *queue.Quota
	instQueue   *queue.Quota
	n2cpuQueue  *queue.Quota
	n2dcpuQueue *queue.Quota
	t2acpuQueue *queue.Quota
	inst        map[string]time.Time // GCE VM instance name -> creationTime
}
   363  
   364  func (p *GCEBuildlet) pollQuotaLoop() {
   365  	for {
   366  		p.pollQuota()
   367  		time.Sleep(time.Minute)
   368  	}
   369  }
   370  
   371  // pollQuota updates cpu usage and limits from the compute API.
   372  func (p *GCEBuildlet) pollQuota() {
   373  	gceAPIGate()
   374  	reg, err := computeService.Regions.Get(buildEnv.ProjectName, buildEnv.VMRegion).Do()
   375  	if err != nil {
   376  		log.Printf("Failed to get quota for %s/%s: %v", buildEnv.ProjectName, buildEnv.VMRegion, err)
   377  		return
   378  	}
   379  
   380  	if err := p.updateUntrackedQuota(); err != nil {
   381  		log.Printf("Failed to update quota used by other instances: %q", err)
   382  	}
   383  	for _, quota := range reg.Quotas {
   384  		switch quota.Metric {
   385  		case "CPUS":
   386  			p.cpuQueue.UpdateLimit(int(quota.Limit))
   387  		case "C2_CPUS":
   388  			p.c2cpuQueue.UpdateLimit(int(quota.Limit))
   389  		case "N2_CPUS":
   390  			p.n2cpuQueue.UpdateLimit(int(quota.Limit))
   391  		case "N2D_CPUS":
   392  			p.n2dcpuQueue.UpdateLimit(int(quota.Limit))
   393  		case "T2A_CPUS":
   394  			p.t2acpuQueue.UpdateLimit(int(quota.Limit))
   395  		case "INSTANCES":
   396  			p.instQueue.UpdateLimit(int(quota.Limit))
   397  		}
   398  	}
   399  }
   400  
   401  func (p *GCEBuildlet) QuotaStats() map[string]*queue.QuotaStats {
   402  	return map[string]*queue.QuotaStats{
   403  		"gce-cpu":       p.cpuQueue.ToExported(),
   404  		"gce-c2-cpu":    p.c2cpuQueue.ToExported(),
   405  		"gce-n2-cpu":    p.n2cpuQueue.ToExported(),
   406  		"gce-n2d-cpu":   p.n2dcpuQueue.ToExported(),
   407  		"gce-t2a-cpu":   p.t2acpuQueue.ToExported(),
   408  		"gce-instances": p.instQueue.ToExported(),
   409  	}
   410  }
   411  
   412  func (p *GCEBuildlet) updateUntrackedQuota() error {
   413  	untrackedQuotas := make(map[*queue.Quota]int)
   414  	for _, zone := range buildEnv.VMZones {
   415  		gceAPIGate()
   416  		err := computeService.Instances.List(buildEnv.ProjectName, zone).Pages(context.Background(), func(list *compute.InstanceList) error {
   417  			for _, inst := range list.Items {
   418  				if isBuildlet(inst.Name) {
   419  					continue
   420  				}
   421  				untrackedQuotas[p.queueForMachineType(inst.MachineType)] += GCENumCPU(inst.MachineType)
   422  			}
   423  			if list.NextPageToken != "" {
   424  				// Don't use all our quota flipping through pages.
   425  				gceAPIGate()
   426  			}
   427  			return nil
   428  		})
   429  		if err != nil {
   430  			return err
   431  		}
   432  	}
   433  	for quota, num := range untrackedQuotas {
   434  		quota.UpdateUntracked(num)
   435  	}
   436  	return nil
   437  }
   438  
   439  // SetEnabled marks the buildlet pool as enabled.
   440  func (p *GCEBuildlet) SetEnabled(enabled bool) {
   441  	p.mu.Lock()
   442  	defer p.mu.Unlock()
   443  	p.disabled = !enabled
   444  }
   445  
   446  // GetBuildlet retrieves a buildlet client for an available buildlet.
   447  func (p *GCEBuildlet) GetBuildlet(ctx context.Context, hostType string, lg Logger, si *queue.SchedItem) (bc buildlet.Client, err error) {
   448  	if p.disabled {
   449  		return nil, errors.New("pool disabled by configuration")
   450  	}
   451  	hconf, ok := dashboard.Hosts[hostType]
   452  	if !ok {
   453  		return nil, fmt.Errorf("gcepool: unknown host type %q", hostType)
   454  	}
   455  	qsp := lg.CreateSpan("awaiting_gce_quota")
   456  	instItem := p.instQueue.Enqueue(1, si)
   457  	if err := instItem.Await(ctx); err != nil {
   458  		return nil, err
   459  	}
   460  	cpuItem := p.queueForMachineType(hconf.MachineType()).Enqueue(GCENumCPU(hconf.MachineType()), si)
   461  	err = cpuItem.Await(ctx)
   462  	qsp.Done(err)
   463  	if err != nil {
   464  		// return unused quota
   465  		instItem.ReturnQuota()
   466  		return nil, err
   467  	}
   468  
   469  	instName := instanceName(hostType, 7)
   470  	instName = strings.Replace(instName, "_", "-", -1) // Issue 22905; can't use underscores in GCE VMs
   471  	p.setInstanceUsed(instName, true)
   472  
   473  	gceBuildletSpan := lg.CreateSpan("create_gce_buildlet", instName)
   474  	defer func() { gceBuildletSpan.Done(err) }()
   475  
   476  	var (
   477  		needDelete   bool
   478  		createSpan   = lg.CreateSpan("create_gce_instance", instName)
   479  		waitBuildlet spanlog.Span // made after create is done
   480  		curSpan      = createSpan // either instSpan or waitBuildlet
   481  	)
   482  
   483  	zone := buildEnv.RandomVMZone()
   484  	cleanup := func() {
   485  		if needDelete {
   486  			deleteVM(zone, instName)
   487  		}
   488  		instItem.ReturnQuota()
   489  		cpuItem.ReturnQuota()
   490  		p.setInstanceUsed(instName, false)
   491  	}
   492  
   493  	log.Printf("Creating GCE VM %q for %s at %s", instName, hostType, zone)
   494  	attempts := 1
   495  	for {
   496  		bc, err = buildlet.StartNewVM(gcpCreds, buildEnv, instName, hostType, buildlet.VMOpts{
   497  			DeleteIn: determineDeleteTimeout(hconf),
   498  			OnInstanceRequested: func() {
   499  				log.Printf("GCE VM %q now booting", instName)
   500  			},
   501  			OnInstanceCreated: func() {
   502  				needDelete = true
   503  
   504  				createSpan.Done(nil)
   505  				waitBuildlet = lg.CreateSpan("wait_buildlet_start", instName)
   506  				curSpan = waitBuildlet
   507  			},
   508  			OnGotInstanceInfo: func(*compute.Instance) {
   509  				lg.LogEventTime("got_instance_info", "waiting_for_buildlet...")
   510  			},
   511  			Zone:       zone,
   512  			DiskSizeGB: hconf.RootDriveSizeGB,
   513  		})
   514  		if errors.Is(err, buildlet.ErrQuotaExceeded) && ctx.Err() == nil {
   515  			log.Printf("Failed to create VM because quota exceeded. Retrying after 10 second (attempt: %d).", attempts)
   516  			attempts++
   517  			time.Sleep(10 * time.Second)
   518  			continue
   519  		} else if err != nil {
   520  			curSpan.Done(err)
   521  			log.Printf("Failed to create VM for %s at %s: %v", hostType, zone, err)
   522  			cleanup()
   523  			return nil, err
   524  		}
   525  		break
   526  	}
   527  	waitBuildlet.Done(nil)
   528  	bc.SetDescription("GCE VM: " + instName)
   529  	bc.SetInstanceName(instName)
   530  	bc.SetOnHeartbeatFailure(cleanup)
   531  	return bc, nil
   532  }
   533  
   534  // WriteHTMLStatus writes the status of the buildlet pool to an io.Writer.
   535  func (p *GCEBuildlet) WriteHTMLStatus(w io.Writer) {
   536  	fmt.Fprintf(w, "<b>GCE pool</b> capacity: %s", p.capacityString())
   537  	const show = 6 // must be even
   538  	active := p.instancesActive()
   539  	if len(active) > 0 {
   540  		fmt.Fprintf(w, "<ul>")
   541  		for i, inst := range active {
   542  			if i < show/2 || i >= len(active)-(show/2) {
   543  				fmt.Fprintf(w, "<li>%v, %s</li>\n", inst.Name, friendlyDuration(time.Since(inst.Creation)))
   544  			} else if i == show/2 {
   545  				fmt.Fprintf(w, "<li>... %d of %d total omitted ...</li>\n", len(active)-show, len(active))
   546  			}
   547  		}
   548  		fmt.Fprintf(w, "</ul>")
   549  	}
   550  }
   551  
   552  func (p *GCEBuildlet) String() string {
   553  	return fmt.Sprintf("GCE pool capacity: %s", p.capacityString())
   554  }
   555  
   556  func (p *GCEBuildlet) capacityString() string {
   557  	cpuUsage := p.cpuQueue.Quotas()
   558  	c2Usage := p.c2cpuQueue.Quotas()
   559  	instUsage := p.instQueue.Quotas()
   560  	n2Usage := p.n2cpuQueue.Quotas()
   561  	n2dUsage := p.n2dcpuQueue.Quotas()
   562  	t2aUsage := p.t2acpuQueue.Quotas()
   563  	return fmt.Sprintf("%d/%d instances; %d/%d CPUs, %d/%d C2_CPUS, %d/%d N2_CPUS, %d/%d N2D_CPUS %d/%d T2A_CPUS",
   564  		instUsage.Used, instUsage.Limit,
   565  		cpuUsage.Used, cpuUsage.Limit,
   566  		c2Usage.Used, c2Usage.Limit,
   567  		n2Usage.Used, n2Usage.Limit,
   568  		n2dUsage.Used, n2dUsage.Limit,
   569  		t2aUsage.Used, t2aUsage.Limit)
   570  }
   571  
   572  func (p *GCEBuildlet) queueForMachineType(mt string) *queue.Quota {
   573  	if strings.HasPrefix(mt, "n2-") {
   574  		return p.n2cpuQueue
   575  	} else if strings.HasPrefix(mt, "n2d-") {
   576  		return p.n2dcpuQueue
   577  	} else if strings.HasPrefix(mt, "c2-") {
   578  		return p.c2cpuQueue
   579  	} else if strings.HasPrefix(mt, "t2a-") {
   580  		return p.t2acpuQueue
   581  	} else {
   582  		// E2 and N1 instances are counted here. We do not use M1, M2,
   583  		// or A2 quotas. See
   584  		// https://cloud.google.com/compute/quotas#cpu_quota.
   585  		return p.cpuQueue
   586  	}
   587  }
   588  
   589  // returnQuota adjusts the dead-reckoning of our quota usage by
   590  // one instance and cpu CPUs.
   591  func (p *GCEBuildlet) returnQuota(hconf *dashboard.HostConfig) {
   592  	machineType := hconf.MachineType()
   593  	p.queueForMachineType(hconf.MachineType()).ReturnQuota(GCENumCPU(machineType))
   594  	p.instQueue.ReturnQuota(1)
   595  }
   596  
   597  func (p *GCEBuildlet) setInstanceUsed(instName string, used bool) {
   598  	p.mu.Lock()
   599  	defer p.mu.Unlock()
   600  	if p.inst == nil {
   601  		p.inst = make(map[string]time.Time)
   602  	}
   603  	if used {
   604  		p.inst[instName] = time.Now()
   605  	} else {
   606  		delete(p.inst, instName)
   607  	}
   608  }
   609  
   610  func (p *GCEBuildlet) instanceUsed(instName string) bool {
   611  	p.mu.Lock()
   612  	defer p.mu.Unlock()
   613  	_, ok := p.inst[instName]
   614  	return ok
   615  }
   616  
   617  func (p *GCEBuildlet) instancesActive() (ret []ResourceTime) {
   618  	p.mu.Lock()
   619  	defer p.mu.Unlock()
   620  	for name, create := range p.inst {
   621  		ret = append(ret, ResourceTime{
   622  			Name:     name,
   623  			Creation: create,
   624  		})
   625  	}
   626  	sort.Sort(ByCreationTime(ret))
   627  	return ret
   628  }
   629  
   630  // ResourceTime is a GCE instance or Kube pod name and its creation time.
   631  type ResourceTime struct {
   632  	Name     string
   633  	Creation time.Time
   634  }
   635  
   636  // ByCreationTime provides the functionality to sort resource times by
   637  // the time of creation.
   638  type ByCreationTime []ResourceTime
   639  
   640  func (s ByCreationTime) Len() int           { return len(s) }
   641  func (s ByCreationTime) Less(i, j int) bool { return s[i].Creation.Before(s[j].Creation) }
   642  func (s ByCreationTime) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
   643  
   644  // CleanUpOldVMs loops forever and periodically enumerates virtual
   645  // machines and deletes those which have expired.
   646  //
   647  // A VM is considered expired if it has a "delete-at" metadata
   648  // attribute having a unix timestamp before the current time.
   649  //
   650  // This is the safety mechanism to delete VMs which stray from the
   651  // normal deleting process. VMs are created to run a single build and
   652  // should be shut down by a controlling process. Due to various types
   653  // of failures, they might get stranded. To prevent them from getting
   654  // stranded and wasting resources forever, we instead set the
   655  // "delete-at" metadata attribute on them when created to some time
   656  // that's well beyond their expected lifetime.
   657  func (p *GCEBuildlet) CleanUpOldVMs() {
   658  	if gceMode == "dev" {
   659  		return
   660  	}
   661  	if computeService == nil {
   662  		return
   663  	}
   664  
   665  	// TODO(bradfitz): remove this list and just query it from the compute API?
   666  	// https://godoc.org/google.golang.org/api/compute/v1#RegionsService.Get
   667  	// and Region.Zones: https://godoc.org/google.golang.org/api/compute/v1#Region
   668  
   669  	for {
   670  		for _, zone := range buildEnv.VMZones {
   671  			if err := p.cleanZoneVMs(zone); err != nil {
   672  				log.Printf("Error cleaning VMs in zone %q: %v", zone, err)
   673  			}
   674  		}
   675  		time.Sleep(time.Minute)
   676  	}
   677  }
   678  
// cleanZoneVMs is part of cleanUpOldVMs, operating on a single zone.
//
// It lists every instance in the zone (rate limited by deletionAPIGate)
// and deletes an instance, via deleteVM, when one of these holds:
//   - its "delete-at" metadata timestamp is in the past;
//   - it is a buildlet without a valid "delete-at", is not in use by this
//     pool, and was created more than three hours ago;
//   - it is a buildlet with a "delete-at" that is not in use by this pool
//     and is not in deletedVMCache (i.e. left over from an earlier
//     coordinator generation).
//
// Instances without metadata and remote buildlets are skipped. The only
// error returned is a failure to list instances; deletion failures are
// handled (logged) inside deleteVM.
func (p *GCEBuildlet) cleanZoneVMs(zone string) error {
	deletionAPIGate()
	err := computeService.Instances.List(buildEnv.ProjectName, zone).Pages(context.Background(), func(list *compute.InstanceList) error {
		for _, inst := range list.Items {
			if inst.Metadata == nil {
				// Defensive. Not seen in practice.
				continue
			}
			if isRemoteBuildlet(inst.Name) {
				// Remote buildlets have their own expiration mechanism that respects active SSH sessions.
				log.Printf("cleanZoneVMs: skipping remote buildlet %q", inst.Name)
				continue
			}
			// sawDeleteAt is set only for a parseable "delete-at" value;
			// deleteReason stays empty unless the VM should be deleted.
			var sawDeleteAt bool
			var deleteReason string
			for _, it := range inst.Metadata.Items {
				if it.Key == "delete-at" {
					if it.Value == nil {
						log.Printf("missing delete-at value; ignoring")
						continue
					}
					unixDeadline, err := strconv.ParseInt(*it.Value, 10, 64)
					if err != nil {
						log.Printf("invalid delete-at value %q seen; ignoring", *it.Value)
						continue
					}
					sawDeleteAt = true
					if time.Now().Unix() > unixDeadline {
						deleteReason = "delete-at expiration"
					}
				}
			}
			// Note: this local shadows the package-level isBuildlet function
			// for the rest of the loop body.
			isBuildlet := isBuildlet(inst.Name)

			if isBuildlet && !sawDeleteAt && !p.instanceUsed(inst.Name) {
				// NOTE(review): the parse error is ignored, so an unparseable
				// CreationTimestamp yields the zero time and the VM is treated
				// as older than three hours — confirm this is intended.
				createdAt, _ := time.Parse(time.RFC3339Nano, inst.CreationTimestamp)
				if createdAt.Before(time.Now().Add(-3 * time.Hour)) {
					deleteReason = fmt.Sprintf("no delete-at, created at %s", inst.CreationTimestamp)
				}
			}

			// Delete buildlets (things we made) from previous
			// generations. Only deleting things starting with "buildlet-"
			// is a historical restriction, but still fine for paranoia.
			if deleteReason == "" && sawDeleteAt && isBuildlet && !p.instanceUsed(inst.Name) {
				// A cache hit means this process already asked for the VM to
				// be deleted, so it is not reported as a leftover.
				if _, ok := deletedVMCache.Get(inst.Name); !ok {
					deleteReason = "from earlier coordinator generation"
				}
			}

			if deleteReason != "" {
				log.Printf("deleting VM %q in zone %q; %s ...", inst.Name, zone, deleteReason)
				deleteVM(zone, inst.Name)
			}
		}
		if list.NextPageToken != "" {
			// Don't use all our quota flipping through pages.
			deletionAPIGate()
		}
		return nil
	})
	if err != nil {
		return fmt.Errorf("listing instances: %v", err)
	}
	return nil
}
   746  
// deletedVMCache remembers the names of the most recent VMs (up to 100)
// that deleteVM was asked to delete. cleanZoneVMs consults it to tell
// those apart from VMs left behind by an earlier coordinator process.
var deletedVMCache = lru.New(100) // keyed by instName

// token is the empty value stored in deletedVMCache; only the key matters.
type token struct{}
   750  
   751  // deleteVM starts a delete of an instance in a given zone.
   752  //
   753  // It either returns an operation name (if delete is pending) or the
   754  // empty string if the instance didn't exist.
   755  func deleteVM(zone, instName string) (operation string, err error) {
   756  	deletedVMCache.Add(instName, token{})
   757  	deletionAPIGate()
   758  	op, err := computeService.Instances.Delete(buildEnv.ProjectName, zone, instName).Do()
   759  	apiErr, ok := err.(*googleapi.Error)
   760  	if ok {
   761  		if apiErr.Code == 404 {
   762  			return "", nil
   763  		}
   764  	}
   765  	if err != nil {
   766  		log.Printf("Failed to delete instance %q in zone %q: %v", instName, zone, err)
   767  		return "", err
   768  	}
   769  	log.Printf("Sent request to delete instance %q in zone %q. Operation ID, Name: %v, %v", instName, zone, op.Id, op.Name)
   770  	return op.Name, nil
   771  }
   772  
   773  // HasScope returns true if the GCE metadata contains the default scopes.
   774  func HasScope(want string) bool {
   775  	// If not on GCE, assume full access
   776  	if !metadata.OnGCE() {
   777  		return true
   778  	}
   779  	scopes, err := metadata.Scopes("default")
   780  	if err != nil {
   781  		log.Printf("failed to query metadata default scopes: %v", err)
   782  		return false
   783  	}
   784  	for _, v := range scopes {
   785  		if v == want {
   786  			return true
   787  		}
   788  	}
   789  	return false
   790  }
   791  
   792  func hasComputeScope() bool {
   793  	return HasScope(compute.ComputeScope) || HasScope(compute.CloudPlatformScope)
   794  }
   795  
   796  func hasStorageScope() bool {
   797  	return HasScope(storage.ScopeReadWrite) || HasScope(storage.ScopeFullControl) || HasScope(compute.CloudPlatformScope)
   798  }
   799  
   800  // syncBuildStatsLoop runs forever in its own goroutine and syncs the
   801  // coordinator's datastore Build & Span entities to BigQuery
   802  // periodically.
   803  func syncBuildStatsLoop(env *buildenv.Environment) {
   804  	ticker := time.NewTicker(5 * time.Minute)
   805  	for {
   806  		ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
   807  		if err := buildstats.SyncBuilds(ctx, env); err != nil {
   808  			log.Printf("buildstats: SyncBuilds: %v", err)
   809  		}
   810  		if err := buildstats.SyncSpans(ctx, env); err != nil {
   811  			log.Printf("buildstats: SyncSpans: %v", err)
   812  		}
   813  		cancel()
   814  		<-ticker.C
   815  	}
   816  }
   817  
   818  // createBasepinDisks creates zone-local copies of VM disk images, to
   819  // speed up VM creations in the future.
   820  //
   821  // Other than a list call, this a no-op unless new VM images were
   822  // added or updated recently.
   823  func createBasepinDisks(ctx context.Context) {
   824  	for {
   825  		t0 := time.Now()
   826  		bgc, err := buildgo.NewClient(ctx, buildEnv)
   827  		if err != nil {
   828  			log.Printf("basepin: NewClient: %v", err)
   829  			return
   830  		}
   831  		log.Printf("basepin: creating basepin disks...")
   832  		err = bgc.MakeBasepinDisks(ctx)
   833  		d := time.Since(t0).Round(time.Second / 10)
   834  		if err != nil {
   835  			basePinErr.Store(err.Error())
   836  			log.Printf("basepin: error creating basepin disks, after %v: %v", d, err)
   837  			time.Sleep(5 * time.Minute)
   838  			continue
   839  		}
   840  		basePinErr.Store("")
   841  		log.Printf("basepin: created basepin disks after %v", d)
   842  		return
   843  	}
   844  }
   845  
   846  // GCENumCPU returns the number of GCE CPUs used by the specified machine type.
   847  func GCENumCPU(machineType string) int {
   848  	if strings.HasSuffix(machineType, "e2-medium") || strings.HasSuffix(machineType, "e2-small") || strings.HasSuffix(machineType, "e2-micro") {
   849  		return 2
   850  	}
   851  	n, _ := strconv.Atoi(machineType[strings.LastIndex(machineType, "-")+1:])
   852  	return n
   853  }