golang.org/x/build@v0.0.0-20240506185731-218518f32b70/buildlet/gce.go (about)

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package buildlet
     6  
     7  import (
     8  	"bytes"
     9  	"context"
    10  	"encoding/json"
    11  	"errors"
    12  	"fmt"
    13  	"io"
    14  	"net"
    15  	"os"
    16  	"os/exec"
    17  	"regexp"
    18  	"sort"
    19  	"strings"
    20  	"sync"
    21  	"time"
    22  
    23  	"golang.org/x/build/buildenv"
    24  	"golang.org/x/build/dashboard"
    25  	"golang.org/x/oauth2"
    26  	"golang.org/x/oauth2/google"
    27  	"google.golang.org/api/compute/v1"
    28  )
    29  
    30  // GCEGate optionally specifies a function to run before any GCE API call.
    31  // It's intended to be used to bound QPS rate to GCE.
    32  var GCEGate func()
    33  
    34  func apiGate() {
    35  	if GCEGate != nil {
    36  		GCEGate()
    37  	}
    38  }
    39  
// ErrQuotaExceeded matches errors.Is when VM creation fails with a
// quota error. Currently, it only supports GCE quota errors.
var ErrQuotaExceeded = errors.New("quota exceeded")

// GCEError is the error type carrying the individual errors reported
// by a completed GCE operation. StartNewVM returns one when the
// instance create operation finishes with errors.
type GCEError struct {
	// OpErrors holds the per-operation errors reported by GCE.
	OpErrors []*compute.OperationErrorErrors
}
    47  
    48  func (q *GCEError) Error() string {
    49  	var buf bytes.Buffer
    50  	fmt.Fprintf(&buf, "%d GCE operation errors: ", len(q.OpErrors))
    51  	for i, e := range q.OpErrors {
    52  		if i != 0 {
    53  			buf.WriteString("; ")
    54  		}
    55  		b, err := json.Marshal(e)
    56  		if err != nil {
    57  			fmt.Fprintf(&buf, "json.Marshal(OpErrors[%d]): %v", i, err)
    58  			continue
    59  		}
    60  		buf.Write(b)
    61  	}
    62  	return buf.String()
    63  }
    64  
    65  func (q *GCEError) Is(target error) bool {
    66  	for _, err := range q.OpErrors {
    67  		if target == ErrQuotaExceeded && err.Code == "QUOTA_EXCEEDED" {
    68  			return true
    69  		}
    70  	}
    71  	return false
    72  }
    73  
    74  // StartNewVM boots a new VM on GCE and returns a buildlet client
    75  // configured to speak to it.
    76  func StartNewVM(creds *google.Credentials, buildEnv *buildenv.Environment, instName, hostType string, opts VMOpts) (Client, error) {
    77  	ctx := context.TODO()
    78  	computeService, _ := compute.New(oauth2.NewClient(ctx, creds.TokenSource))
    79  
    80  	if opts.Description == "" {
    81  		opts.Description = fmt.Sprintf("Go Builder for %s", hostType)
    82  	}
    83  	if opts.ProjectID == "" {
    84  		opts.ProjectID = buildEnv.ProjectName
    85  	}
    86  	if opts.Zone == "" {
    87  		opts.Zone = buildEnv.RandomVMZone()
    88  	}
    89  	zone := opts.Zone
    90  	if opts.DeleteIn == 0 {
    91  		opts.DeleteIn = 30 * time.Minute
    92  	}
    93  
    94  	hconf, ok := dashboard.Hosts[hostType]
    95  	if !ok {
    96  		return nil, fmt.Errorf("invalid host type %q", hostType)
    97  	}
    98  	if !hconf.IsVM() && !hconf.IsContainer() {
    99  		return nil, fmt.Errorf("host %q is type %q; want either a VM or container type", hostType, hconf.PoolName())
   100  	}
   101  
   102  	projectID := opts.ProjectID
   103  	if projectID == "" {
   104  		return nil, errors.New("buildlet: missing required ProjectID option")
   105  	}
   106  
   107  	prefix := "https://www.googleapis.com/compute/v1/projects/" + projectID
   108  	machType := prefix + "/zones/" + zone + "/machineTypes/" + hconf.MachineType()
   109  	diskType := "https://www.googleapis.com/compute/v1/projects/" + projectID + "/zones/" + zone + "/diskTypes/pd-ssd"
   110  	if hconf.RegularDisk {
   111  		diskType = "" // a spinning disk
   112  	}
   113  
   114  	srcImage := "https://www.googleapis.com/compute/v1/projects/" + projectID + "/global/images/" + hconf.VMImage
   115  	minCPU := hconf.MinCPUPlatform
   116  	if hconf.IsContainer() {
   117  		if hconf.NestedVirt {
   118  			minCPU = "Intel Cascade Lake" // n2 vms (which support NestedVirtualization) are either Ice Lake or Cascade Lake.
   119  		}
   120  		if vm := hconf.ContainerVMImage(); vm != "" {
   121  			srcImage = "https://www.googleapis.com/compute/v1/projects/" + projectID + "/global/images/" + vm
   122  		} else {
   123  			var err error
   124  			srcImage, err = cosImage(ctx, computeService, hconf.CosArchitecture())
   125  			if err != nil {
   126  				return nil, fmt.Errorf("error find Container-Optimized OS image: %v", err)
   127  			}
   128  		}
   129  	}
   130  
   131  	instance := &compute.Instance{
   132  		Name:           instName,
   133  		Description:    opts.Description,
   134  		MachineType:    machType,
   135  		MinCpuPlatform: minCPU,
   136  		Disks: []*compute.AttachedDisk{
   137  			{
   138  				AutoDelete: true,
   139  				Boot:       true,
   140  				Type:       "PERSISTENT",
   141  				InitializeParams: &compute.AttachedDiskInitializeParams{
   142  					DiskName:    instName,
   143  					SourceImage: srcImage,
   144  					DiskType:    diskType,
   145  					DiskSizeGb:  opts.DiskSizeGB,
   146  				},
   147  			},
   148  		},
   149  		Tags: &compute.Tags{
   150  			// Warning: do NOT list "http-server" or "allow-ssh" (our
   151  			// project's custom tag to allow ssh access) here; the
   152  			// buildlet provides full remote code execution.
   153  			// The https-server is authenticated, though.
   154  			Items: []string{"https-server"},
   155  		},
   156  		Metadata: &compute.Metadata{},
   157  		NetworkInterfaces: []*compute.NetworkInterface{{
   158  			Network: prefix + "/global/networks/default-vpc",
   159  		}},
   160  
   161  		// Prior to git rev 1b1e086fd, we used preemptible
   162  		// instances, as we were helping test the feature. It was
   163  		// removed after git rev a23395d because we hadn't been
   164  		// using it for some time. Our VMs are so short-lived that
   165  		// the feature doesn't really help anyway. But if we ever
   166  		// find we want it again, this comment is here to point to
   167  		// code that might be useful to partially resurrect.
   168  		Scheduling: &compute.Scheduling{Preemptible: false},
   169  	}
   170  
   171  	// Container builders use the COS image, which defaults to logging to Cloud Logging.
   172  	// Permission is granted to this service account.
   173  	if hconf.IsContainer() && buildEnv.COSServiceAccount != "" {
   174  		instance.ServiceAccounts = []*compute.ServiceAccount{
   175  			{
   176  				Email:  buildEnv.COSServiceAccount,
   177  				Scopes: []string{compute.CloudPlatformScope},
   178  			},
   179  		}
   180  	}
   181  
   182  	addMeta := func(key, value string) {
   183  		instance.Metadata.Items = append(instance.Metadata.Items, &compute.MetadataItems{
   184  			Key:   key,
   185  			Value: &value,
   186  		})
   187  	}
   188  	// The buildlet-binary-url is the URL of the buildlet binary
   189  	// which the VMs are configured to download at boot and run.
   190  	// This lets us/ update the buildlet more easily than
   191  	// rebuilding the whole VM image.
   192  	addMeta("buildlet-binary-url", hconf.BuildletBinaryURL(buildenv.ByProjectID(opts.ProjectID)))
   193  	addMeta("buildlet-host-type", hostType)
   194  	if !opts.TLS.IsZero() {
   195  		addMeta("tls-cert", opts.TLS.CertPEM)
   196  		addMeta("tls-key", opts.TLS.KeyPEM)
   197  		addMeta("password", opts.TLS.Password())
   198  	}
   199  	if hconf.IsContainer() && hconf.CosArchitecture() == dashboard.CosArchAMD64 {
   200  		addMeta("gce-container-declaration", fmt.Sprintf(`spec:
   201    containers:
   202      - name: buildlet
   203        image: 'gcr.io/%s/%s'
   204        volumeMounts:
   205          - name: tmpfs-0
   206            mountPath: /workdir
   207        securityContext:
   208          privileged: true
   209        stdin: false
   210        tty: false
   211    restartPolicy: Always
   212    volumes:
   213      - name: tmpfs-0
   214        emptyDir:
   215          medium: Memory
   216  `, opts.ProjectID, hconf.ContainerImage))
   217  		addMeta("user-data", `#cloud-config
   218  
   219  runcmd:
   220  - sysctl -w kernel.core_pattern=core
   221  `)
   222  	} else if hconf.IsContainer() && hconf.CosArchitecture() == dashboard.CosArchARM64 {
   223  		addMeta("user-data", fmt.Sprintf(`#cloud-config
   224  
   225  write_files:
   226  - path: /etc/systemd/system/buildlet.service
   227    permissions: 0644
   228    owner: root:root
   229    content: |
   230      [Unit]
   231      Description=Start buildlet container
   232      Wants=gcr-online.target
   233      After=gcr-online.target
   234  
   235      [Service]
   236      Environment="HOME=/home/buildlet"
   237      ExecStart=/usr/bin/docker run --rm --name=buildlet --privileged -p 80:80 gcr.io/%s/%s
   238      ExecStop=/usr/bin/docker stop buildlet
   239      ExecStopPost=/usr/bin/docker rm buildlet
   240      RemainAfterExit=true
   241      Type=oneshot
   242  
   243  runcmd:
   244  - systemctl daemon-reload
   245  - systemctl start buildlet.service
   246  - sysctl -w kernel.core_pattern=core
   247  `, opts.ProjectID, hconf.ContainerImage))
   248  	}
   249  
   250  	if opts.DeleteIn > 0 {
   251  		// In case the VM gets away from us (generally: if the
   252  		// coordinator dies while a build is running), then we
   253  		// set this attribute of when it should be killed so
   254  		// we can kill it later when the coordinator is
   255  		// restarted. The cleanUpOldVMs goroutine loop handles
   256  		// that killing.
   257  		addMeta("delete-at", fmt.Sprint(time.Now().Add(opts.DeleteIn).Unix()))
   258  	}
   259  
   260  	for k, v := range opts.Meta {
   261  		addMeta(k, v)
   262  	}
   263  
   264  	apiGate()
   265  	op, err := computeService.Instances.Insert(projectID, zone, instance).Do()
   266  	if err != nil {
   267  		return nil, fmt.Errorf("Failed to create instance: %v", err)
   268  	}
   269  	condRun(opts.OnInstanceRequested)
   270  	createOp := op.Name
   271  
   272  	// Wait for instance create operation to succeed.
   273  OpLoop:
   274  	for {
   275  		time.Sleep(2 * time.Second)
   276  		apiGate()
   277  		op, err := computeService.ZoneOperations.Get(projectID, zone, createOp).Do()
   278  		if err != nil {
   279  			return nil, fmt.Errorf("failed to get op %s: %v", createOp, err)
   280  		}
   281  		switch op.Status {
   282  		case "PENDING", "RUNNING":
   283  			continue
   284  		case "DONE":
   285  			if op.Error != nil {
   286  				err := &GCEError{OpErrors: make([]*compute.OperationErrorErrors, len(op.Error.Errors))}
   287  				copy(err.OpErrors, op.Error.Errors)
   288  				return nil, err
   289  			}
   290  			break OpLoop
   291  		default:
   292  			return nil, fmt.Errorf("unknown create status %q: %+v", op.Status, op)
   293  		}
   294  	}
   295  	condRun(opts.OnInstanceCreated)
   296  
   297  	apiGate()
   298  	inst, err := computeService.Instances.Get(projectID, zone, instName).Do()
   299  	if err != nil {
   300  		return nil, fmt.Errorf("Error getting instance %s details after creation: %v", instName, err)
   301  	}
   302  
   303  	// Finds its internal and/or external IP addresses.
   304  	intIP, extIP := instanceIPs(inst)
   305  
   306  	// Wait for it to boot and its buildlet to come up.
   307  	var buildletURL string
   308  	var ipPort string
   309  	if !opts.TLS.IsZero() {
   310  		if extIP == "" {
   311  			return nil, errors.New("didn't find its external IP address")
   312  		}
   313  		buildletURL = "https://" + extIP
   314  		ipPort = extIP + ":443"
   315  	} else {
   316  		if intIP == "" {
   317  			return nil, errors.New("didn't find its internal IP address")
   318  		}
   319  		buildletURL = "http://" + intIP
   320  		ipPort = intIP + ":80"
   321  	}
   322  	if opts.OnGotInstanceInfo != nil {
   323  		opts.OnGotInstanceInfo(inst)
   324  	}
   325  	var closeFunc func()
   326  	if opts.UseIAPTunnel {
   327  		var localPort string
   328  		var err error
   329  		localPort, closeFunc, err = createIAPTunnel(ctx, inst)
   330  		if err != nil {
   331  			return nil, fmt.Errorf("creating IAP tunnel: %v", err)
   332  		}
   333  		buildletURL = "http://localhost:" + localPort
   334  		ipPort = "127.0.0.1:" + localPort
   335  	}
   336  	client, err := buildletClient(ctx, buildletURL, ipPort, &opts)
   337  	if err != nil {
   338  		return nil, err
   339  	}
   340  	if closeFunc != nil {
   341  		return &extraCloseClient{client, closeFunc}, nil
   342  	}
   343  	return client, nil
   344  }
   345  
   346  type extraCloseClient struct {
   347  	Client
   348  	close func()
   349  }
   350  
   351  func (e *extraCloseClient) Close() error {
   352  	defer e.close()
   353  	return e.Close()
   354  }
   355  
   356  func createIAPTunnel(ctx context.Context, inst *compute.Instance) (string, func(), error) {
   357  	// Allocate a local listening port.
   358  	ln, err := net.Listen("tcp", "localhost:0")
   359  	if err != nil {
   360  		return "", nil, err
   361  	}
   362  	localAddr := ln.Addr().(*net.TCPAddr)
   363  	ln.Close()
   364  	// Start the gcloud command. For some reason, when gcloud is run with a
   365  	// pipe for stdout, it doesn't log the success message, so we can only
   366  	// check for success empirically.
   367  	m := regexp.MustCompile(`/projects/([^/]+)/zones/([^/]+)`).FindStringSubmatch(inst.Zone)
   368  	if m == nil {
   369  		return "", nil, fmt.Errorf("unexpected inst.Zone: %q", inst.Zone)
   370  	}
   371  	project, zone := m[1], m[2]
   372  	tunnelCmd := exec.CommandContext(ctx,
   373  		"gcloud", "compute", "start-iap-tunnel", "--iap-tunnel-disable-connection-check",
   374  		"--project", project, "--zone", zone, inst.Name, "80", "--local-host-port", localAddr.String())
   375  
   376  	// hideWriter hides the underlying io.Writer from os/exec, bypassing the
   377  	// special case where os/exec will let a subprocess share the fd to an
   378  	// *os.File. Using hideWriter will result in goroutines that copy from a
   379  	// fresh pipe and write to the writer in the parent Go program.
   380  	// That guarantees that if the subprocess
   381  	// leaves background processes lying around, they will not keep lingering
   382  	// references to the parent Go program's stdout and stderr.
   383  	//
   384  	// Prior to this, it was common for ./debugnewvm | cat to never finish,
   385  	// because debugnewvm left some gcloud helper processes behind, and cat
   386  	// (or any other program) would never observe EOF on its input pipe.
   387  	// We now try to shut gcloud down more carefully with os.Interrupt below,
   388  	// but hideWriter guarantees that lingering processes won't hang
   389  	// pipelines.
   390  	type hideWriter struct{ io.Writer }
   391  	tunnelCmd.Stderr = hideWriter{os.Stderr}
   392  	tunnelCmd.Stdout = hideWriter{os.Stdout}
   393  
   394  	if err := tunnelCmd.Start(); err != nil {
   395  		return "", nil, err
   396  	}
   397  	// Start the process. Either it's going to fail to start after a bit, or
   398  	// it'll start listening on its port. Because we told it not to check the
   399  	// connection above, the connections won't be functional, but we can dial.
   400  	errc := make(chan error, 1)
   401  	go func() { errc <- tunnelCmd.Wait() }()
   402  	for start := time.Now(); time.Since(start) < 60*time.Second; time.Sleep(5 * time.Second) {
   403  		// Check if the server crashed.
   404  		select {
   405  		case err := <-errc:
   406  			return "", nil, err
   407  		default:
   408  		}
   409  		// Check if it's healthy.
   410  		conn, err := net.DialTCP("tcp", nil, localAddr)
   411  		if err == nil {
   412  			conn.Close()
   413  			kill := func() {
   414  				// gcloud compute start-iap-tunnel is a group of Python processes,
   415  				// so send an interrupt to try for an orderly shutdown of the process tree
   416  				// before killing the process outright.
   417  				tunnelCmd.Process.Signal(os.Interrupt)
   418  				time.Sleep(2 * time.Second)
   419  				tunnelCmd.Process.Kill()
   420  			}
   421  			return fmt.Sprint(localAddr.Port), kill, nil
   422  		}
   423  	}
   424  	return "", nil, fmt.Errorf("iap tunnel startup timed out")
   425  }
   426  
// VM holds identifying and connection details for a GCE VM.
// NOTE(review): nothing in this part of the file constructs or reads a
// VM, so the field descriptions below are inferred from the field
// names and types; confirm against the code that uses it.
type VM struct {
	// Name is the name of the GCE VM instance.
	// For example, it's of the form "mote-bradfitz-plan9-386-foo",
	// and not "plan9-386-foo".
	Name   string
	IPPort string  // presumably the "ip:port" address of the VM's buildlet; TODO confirm
	TLS    KeyPair // presumably the TLS key pair used to talk to the buildlet; TODO confirm
	Type   string  // buildlet type
}
   436  
   437  func instanceIPs(inst *compute.Instance) (intIP, extIP string) {
   438  	for _, iface := range inst.NetworkInterfaces {
   439  		if strings.HasPrefix(iface.NetworkIP, "10.") {
   440  			intIP = iface.NetworkIP
   441  		}
   442  		for _, accessConfig := range iface.AccessConfigs {
   443  			if accessConfig.Type == "ONE_TO_ONE_NAT" {
   444  				extIP = accessConfig.NatIP
   445  			}
   446  		}
   447  	}
   448  	return
   449  }
   450  
// State backing the cosImage lookup cache.
var (
	// cosListMu is held by cosImage for the full duration of each call,
	// guarding cosCache.
	cosListMu sync.Mutex
	// NOTE(review): cosCachedTime is not referenced anywhere in the
	// visible source; it may be a leftover. Confirm before removing.
	cosCachedTime time.Time
	// cosCache maps a COS architecture to its most recent lookup result.
	cosCache = map[dashboard.CosArch]*cosCacheEntry{}
)

// cosCacheEntry is a single cached cosImage result.
type cosCacheEntry struct {
	cachedTime  time.Time // when cachedImage was fetched
	cachedImage string    // the image's SelfLink as returned by the API
}
   461  
   462  // cosImage returns the GCP VM image name of the latest stable
   463  // Container-Optimized OS image. It caches results for 15 minutes.
   464  func cosImage(ctx context.Context, svc *compute.Service, arch dashboard.CosArch) (string, error) {
   465  	const cacheDuration = 15 * time.Minute
   466  	cosListMu.Lock()
   467  	defer cosListMu.Unlock()
   468  
   469  	cosQuery := func(a dashboard.CosArch) (string, error) {
   470  		imList, err := svc.Images.List("cos-cloud").Filter(fmt.Sprintf("(family eq %q)", string(arch))).Context(ctx).Do()
   471  		if err != nil {
   472  			return "", err
   473  		}
   474  		if imList.NextPageToken != "" {
   475  			return "", fmt.Errorf("too many images; pagination not supported")
   476  		}
   477  		ims := imList.Items
   478  		if len(ims) == 0 {
   479  			return "", errors.New("no image found")
   480  		}
   481  		sort.Slice(ims, func(i, j int) bool {
   482  			if ims[i].Deprecated == nil && ims[j].Deprecated != nil {
   483  				return true
   484  			}
   485  			return ims[i].CreationTimestamp > ims[j].CreationTimestamp
   486  		})
   487  		return ims[0].SelfLink, nil
   488  	}
   489  	c, ok := cosCache[arch]
   490  	if !ok {
   491  		image, err := cosQuery(arch)
   492  		if err != nil {
   493  			return "", err
   494  		}
   495  		cosCache[arch] = &cosCacheEntry{
   496  			cachedTime:  time.Now(),
   497  			cachedImage: image,
   498  		}
   499  		return image, nil
   500  	}
   501  	if c.cachedImage != "" && c.cachedTime.After(time.Now().Add(-cacheDuration)) {
   502  		return c.cachedImage, nil
   503  	}
   504  	image, err := cosQuery(arch)
   505  	if err != nil {
   506  		return "", err
   507  	}
   508  	c.cachedImage = image
   509  	c.cachedTime = time.Now()
   510  	return image, nil
   511  }