golang.org/x/build@v0.0.0-20240506185731-218518f32b70/buildlet/gce.go (about) 1 // Copyright 2015 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package buildlet 6 7 import ( 8 "bytes" 9 "context" 10 "encoding/json" 11 "errors" 12 "fmt" 13 "io" 14 "net" 15 "os" 16 "os/exec" 17 "regexp" 18 "sort" 19 "strings" 20 "sync" 21 "time" 22 23 "golang.org/x/build/buildenv" 24 "golang.org/x/build/dashboard" 25 "golang.org/x/oauth2" 26 "golang.org/x/oauth2/google" 27 "google.golang.org/api/compute/v1" 28 ) 29 30 // GCEGate optionally specifies a function to run before any GCE API call. 31 // It's intended to be used to bound QPS rate to GCE. 32 var GCEGate func() 33 34 func apiGate() { 35 if GCEGate != nil { 36 GCEGate() 37 } 38 } 39 40 // ErrQuotaExceeded matches errors.Is when VM creation fails with a 41 // quota error. Currently, it only supports GCE quota errors. 42 var ErrQuotaExceeded = errors.New("quota exceeded") 43 44 type GCEError struct { 45 OpErrors []*compute.OperationErrorErrors 46 } 47 48 func (q *GCEError) Error() string { 49 var buf bytes.Buffer 50 fmt.Fprintf(&buf, "%d GCE operation errors: ", len(q.OpErrors)) 51 for i, e := range q.OpErrors { 52 if i != 0 { 53 buf.WriteString("; ") 54 } 55 b, err := json.Marshal(e) 56 if err != nil { 57 fmt.Fprintf(&buf, "json.Marshal(OpErrors[%d]): %v", i, err) 58 continue 59 } 60 buf.Write(b) 61 } 62 return buf.String() 63 } 64 65 func (q *GCEError) Is(target error) bool { 66 for _, err := range q.OpErrors { 67 if target == ErrQuotaExceeded && err.Code == "QUOTA_EXCEEDED" { 68 return true 69 } 70 } 71 return false 72 } 73 74 // StartNewVM boots a new VM on GCE and returns a buildlet client 75 // configured to speak to it. 76 func StartNewVM(creds *google.Credentials, buildEnv *buildenv.Environment, instName, hostType string, opts VMOpts) (Client, error) { 77 ctx := context.TODO() 78 computeService, _ := compute.New(oauth2.NewClient(ctx, creds.TokenSource)) 79 80 if opts.Description == "" { 81 opts.Description = fmt.Sprintf("Go Builder for %s", hostType) 82 } 83 if opts.ProjectID == "" { 84 opts.ProjectID = buildEnv.ProjectName 85 } 86 if opts.Zone == "" { 87 opts.Zone = buildEnv.RandomVMZone() 88 } 89 zone := opts.Zone 90 if opts.DeleteIn == 0 { 91 opts.DeleteIn = 30 * time.Minute 92 } 93 94 hconf, ok := dashboard.Hosts[hostType] 95 if !ok { 96 return nil, fmt.Errorf("invalid host type %q", hostType) 97 } 98 if !hconf.IsVM() && !hconf.IsContainer() { 99 return nil, fmt.Errorf("host %q is type %q; want either a VM or container type", hostType, hconf.PoolName()) 100 } 101 102 projectID := opts.ProjectID 103 if projectID == "" { 104 return nil, errors.New("buildlet: missing required ProjectID option") 105 } 106 107 prefix := "https://www.googleapis.com/compute/v1/projects/" + projectID 108 machType := prefix + "/zones/" + zone + "/machineTypes/" + hconf.MachineType() 109 diskType := "https://www.googleapis.com/compute/v1/projects/" + projectID + "/zones/" + zone + "/diskTypes/pd-ssd" 110 if hconf.RegularDisk { 111 diskType = "" // a spinning disk 112 } 113 114 srcImage := "https://www.googleapis.com/compute/v1/projects/" + projectID + "/global/images/" + hconf.VMImage 115 minCPU := hconf.MinCPUPlatform 116 if hconf.IsContainer() { 117 if hconf.NestedVirt { 118 minCPU = "Intel Cascade Lake" // n2 vms (which support NestedVirtualization) are either Ice Lake or Cascade Lake. 119 } 120 if vm := hconf.ContainerVMImage(); vm != "" { 121 srcImage = "https://www.googleapis.com/compute/v1/projects/" + projectID + "/global/images/" + vm 122 } else { 123 var err error 124 srcImage, err = cosImage(ctx, computeService, hconf.CosArchitecture()) 125 if err != nil { 126 return nil, fmt.Errorf("error find Container-Optimized OS image: %v", err) 127 } 128 } 129 } 130 131 instance := &compute.Instance{ 132 Name: instName, 133 Description: opts.Description, 134 MachineType: machType, 135 MinCpuPlatform: minCPU, 136 Disks: []*compute.AttachedDisk{ 137 { 138 AutoDelete: true, 139 Boot: true, 140 Type: "PERSISTENT", 141 InitializeParams: &compute.AttachedDiskInitializeParams{ 142 DiskName: instName, 143 SourceImage: srcImage, 144 DiskType: diskType, 145 DiskSizeGb: opts.DiskSizeGB, 146 }, 147 }, 148 }, 149 Tags: &compute.Tags{ 150 // Warning: do NOT list "http-server" or "allow-ssh" (our 151 // project's custom tag to allow ssh access) here; the 152 // buildlet provides full remote code execution. 153 // The https-server is authenticated, though. 154 Items: []string{"https-server"}, 155 }, 156 Metadata: &compute.Metadata{}, 157 NetworkInterfaces: []*compute.NetworkInterface{{ 158 Network: prefix + "/global/networks/default-vpc", 159 }}, 160 161 // Prior to git rev 1b1e086fd, we used preemptible 162 // instances, as we were helping test the feature. It was 163 // removed after git rev a23395d because we hadn't been 164 // using it for some time. Our VMs are so short-lived that 165 // the feature doesn't really help anyway. But if we ever 166 // find we want it again, this comment is here to point to 167 // code that might be useful to partially resurrect. 168 Scheduling: &compute.Scheduling{Preemptible: false}, 169 } 170 171 // Container builders use the COS image, which defaults to logging to Cloud Logging. 172 // Permission is granted to this service account. 173 if hconf.IsContainer() && buildEnv.COSServiceAccount != "" { 174 instance.ServiceAccounts = []*compute.ServiceAccount{ 175 { 176 Email: buildEnv.COSServiceAccount, 177 Scopes: []string{compute.CloudPlatformScope}, 178 }, 179 } 180 } 181 182 addMeta := func(key, value string) { 183 instance.Metadata.Items = append(instance.Metadata.Items, &compute.MetadataItems{ 184 Key: key, 185 Value: &value, 186 }) 187 } 188 // The buildlet-binary-url is the URL of the buildlet binary 189 // which the VMs are configured to download at boot and run. 190 // This lets us/ update the buildlet more easily than 191 // rebuilding the whole VM image. 192 addMeta("buildlet-binary-url", hconf.BuildletBinaryURL(buildenv.ByProjectID(opts.ProjectID))) 193 addMeta("buildlet-host-type", hostType) 194 if !opts.TLS.IsZero() { 195 addMeta("tls-cert", opts.TLS.CertPEM) 196 addMeta("tls-key", opts.TLS.KeyPEM) 197 addMeta("password", opts.TLS.Password()) 198 } 199 if hconf.IsContainer() && hconf.CosArchitecture() == dashboard.CosArchAMD64 { 200 addMeta("gce-container-declaration", fmt.Sprintf(`spec: 201 containers: 202 - name: buildlet 203 image: 'gcr.io/%s/%s' 204 volumeMounts: 205 - name: tmpfs-0 206 mountPath: /workdir 207 securityContext: 208 privileged: true 209 stdin: false 210 tty: false 211 restartPolicy: Always 212 volumes: 213 - name: tmpfs-0 214 emptyDir: 215 medium: Memory 216 `, opts.ProjectID, hconf.ContainerImage)) 217 addMeta("user-data", `#cloud-config 218 219 runcmd: 220 - sysctl -w kernel.core_pattern=core 221 `) 222 } else if hconf.IsContainer() && hconf.CosArchitecture() == dashboard.CosArchARM64 { 223 addMeta("user-data", fmt.Sprintf(`#cloud-config 224 225 write_files: 226 - path: /etc/systemd/system/buildlet.service 227 permissions: 0644 228 owner: root:root 229 content: | 230 [Unit] 231 Description=Start buildlet container 232 Wants=gcr-online.target 233 After=gcr-online.target 234 235 [Service] 236 Environment="HOME=/home/buildlet" 237 ExecStart=/usr/bin/docker run --rm --name=buildlet --privileged -p 80:80 gcr.io/%s/%s 238 ExecStop=/usr/bin/docker stop buildlet 239 ExecStopPost=/usr/bin/docker rm buildlet 240 RemainAfterExit=true 241 Type=oneshot 242 243 runcmd: 244 - systemctl daemon-reload 245 - systemctl start buildlet.service 246 - sysctl -w kernel.core_pattern=core 247 `, opts.ProjectID, hconf.ContainerImage)) 248 } 249 250 if opts.DeleteIn > 0 { 251 // In case the VM gets away from us (generally: if the 252 // coordinator dies while a build is running), then we 253 // set this attribute of when it should be killed so 254 // we can kill it later when the coordinator is 255 // restarted. The cleanUpOldVMs goroutine loop handles 256 // that killing. 257 addMeta("delete-at", fmt.Sprint(time.Now().Add(opts.DeleteIn).Unix())) 258 } 259 260 for k, v := range opts.Meta { 261 addMeta(k, v) 262 } 263 264 apiGate() 265 op, err := computeService.Instances.Insert(projectID, zone, instance).Do() 266 if err != nil { 267 return nil, fmt.Errorf("Failed to create instance: %v", err) 268 } 269 condRun(opts.OnInstanceRequested) 270 createOp := op.Name 271 272 // Wait for instance create operation to succeed. 273 OpLoop: 274 for { 275 time.Sleep(2 * time.Second) 276 apiGate() 277 op, err := computeService.ZoneOperations.Get(projectID, zone, createOp).Do() 278 if err != nil { 279 return nil, fmt.Errorf("failed to get op %s: %v", createOp, err) 280 } 281 switch op.Status { 282 case "PENDING", "RUNNING": 283 continue 284 case "DONE": 285 if op.Error != nil { 286 err := &GCEError{OpErrors: make([]*compute.OperationErrorErrors, len(op.Error.Errors))} 287 copy(err.OpErrors, op.Error.Errors) 288 return nil, err 289 } 290 break OpLoop 291 default: 292 return nil, fmt.Errorf("unknown create status %q: %+v", op.Status, op) 293 } 294 } 295 condRun(opts.OnInstanceCreated) 296 297 apiGate() 298 inst, err := computeService.Instances.Get(projectID, zone, instName).Do() 299 if err != nil { 300 return nil, fmt.Errorf("Error getting instance %s details after creation: %v", instName, err) 301 } 302 303 // Finds its internal and/or external IP addresses. 304 intIP, extIP := instanceIPs(inst) 305 306 // Wait for it to boot and its buildlet to come up. 307 var buildletURL string 308 var ipPort string 309 if !opts.TLS.IsZero() { 310 if extIP == "" { 311 return nil, errors.New("didn't find its external IP address") 312 } 313 buildletURL = "https://" + extIP 314 ipPort = extIP + ":443" 315 } else { 316 if intIP == "" { 317 return nil, errors.New("didn't find its internal IP address") 318 } 319 buildletURL = "http://" + intIP 320 ipPort = intIP + ":80" 321 } 322 if opts.OnGotInstanceInfo != nil { 323 opts.OnGotInstanceInfo(inst) 324 } 325 var closeFunc func() 326 if opts.UseIAPTunnel { 327 var localPort string 328 var err error 329 localPort, closeFunc, err = createIAPTunnel(ctx, inst) 330 if err != nil { 331 return nil, fmt.Errorf("creating IAP tunnel: %v", err) 332 } 333 buildletURL = "http://localhost:" + localPort 334 ipPort = "127.0.0.1:" + localPort 335 } 336 client, err := buildletClient(ctx, buildletURL, ipPort, &opts) 337 if err != nil { 338 return nil, err 339 } 340 if closeFunc != nil { 341 return &extraCloseClient{client, closeFunc}, nil 342 } 343 return client, nil 344 } 345 346 type extraCloseClient struct { 347 Client 348 close func() 349 } 350 351 func (e *extraCloseClient) Close() error { 352 defer e.close() 353 return e.Close() 354 } 355 356 func createIAPTunnel(ctx context.Context, inst *compute.Instance) (string, func(), error) { 357 // Allocate a local listening port. 358 ln, err := net.Listen("tcp", "localhost:0") 359 if err != nil { 360 return "", nil, err 361 } 362 localAddr := ln.Addr().(*net.TCPAddr) 363 ln.Close() 364 // Start the gcloud command. For some reason, when gcloud is run with a 365 // pipe for stdout, it doesn't log the success message, so we can only 366 // check for success empirically. 367 m := regexp.MustCompile(`/projects/([^/]+)/zones/([^/]+)`).FindStringSubmatch(inst.Zone) 368 if m == nil { 369 return "", nil, fmt.Errorf("unexpected inst.Zone: %q", inst.Zone) 370 } 371 project, zone := m[1], m[2] 372 tunnelCmd := exec.CommandContext(ctx, 373 "gcloud", "compute", "start-iap-tunnel", "--iap-tunnel-disable-connection-check", 374 "--project", project, "--zone", zone, inst.Name, "80", "--local-host-port", localAddr.String()) 375 376 // hideWriter hides the underlying io.Writer from os/exec, bypassing the 377 // special case where os/exec will let a subprocess share the fd to an 378 // *os.File. Using hideWriter will result in goroutines that copy from a 379 // fresh pipe and write to the writer in the parent Go program. 380 // That guarantees that if the subprocess 381 // leaves background processes lying around, they will not keep lingering 382 // references to the parent Go program's stdout and stderr. 383 // 384 // Prior to this, it was common for ./debugnewvm | cat to never finish, 385 // because debugnewvm left some gcloud helper processes behind, and cat 386 // (or any other program) would never observe EOF on its input pipe. 387 // We now try to shut gcloud down more carefully with os.Interrupt below, 388 // but hideWriter guarantees that lingering processes won't hang 389 // pipelines. 390 type hideWriter struct{ io.Writer } 391 tunnelCmd.Stderr = hideWriter{os.Stderr} 392 tunnelCmd.Stdout = hideWriter{os.Stdout} 393 394 if err := tunnelCmd.Start(); err != nil { 395 return "", nil, err 396 } 397 // Start the process. Either it's going to fail to start after a bit, or 398 // it'll start listening on its port. Because we told it not to check the 399 // connection above, the connections won't be functional, but we can dial. 400 errc := make(chan error, 1) 401 go func() { errc <- tunnelCmd.Wait() }() 402 for start := time.Now(); time.Since(start) < 60*time.Second; time.Sleep(5 * time.Second) { 403 // Check if the server crashed. 404 select { 405 case err := <-errc: 406 return "", nil, err 407 default: 408 } 409 // Check if it's healthy. 410 conn, err := net.DialTCP("tcp", nil, localAddr) 411 if err == nil { 412 conn.Close() 413 kill := func() { 414 // gcloud compute start-iap-tunnel is a group of Python processes, 415 // so send an interrupt to try for an orderly shutdown of the process tree 416 // before killing the process outright. 417 tunnelCmd.Process.Signal(os.Interrupt) 418 time.Sleep(2 * time.Second) 419 tunnelCmd.Process.Kill() 420 } 421 return fmt.Sprint(localAddr.Port), kill, nil 422 } 423 } 424 return "", nil, fmt.Errorf("iap tunnel startup timed out") 425 } 426 427 type VM struct { 428 // Name is the name of the GCE VM instance. 429 // For example, it's of the form "mote-bradfitz-plan9-386-foo", 430 // and not "plan9-386-foo". 431 Name string 432 IPPort string 433 TLS KeyPair 434 Type string // buildlet type 435 } 436 437 func instanceIPs(inst *compute.Instance) (intIP, extIP string) { 438 for _, iface := range inst.NetworkInterfaces { 439 if strings.HasPrefix(iface.NetworkIP, "10.") { 440 intIP = iface.NetworkIP 441 } 442 for _, accessConfig := range iface.AccessConfigs { 443 if accessConfig.Type == "ONE_TO_ONE_NAT" { 444 extIP = accessConfig.NatIP 445 } 446 } 447 } 448 return 449 } 450 451 var ( 452 cosListMu sync.Mutex 453 cosCachedTime time.Time 454 cosCache = map[dashboard.CosArch]*cosCacheEntry{} 455 ) 456 457 type cosCacheEntry struct { 458 cachedTime time.Time 459 cachedImage string 460 } 461 462 // cosImage returns the GCP VM image name of the latest stable 463 // Container-Optimized OS image. It caches results for 15 minutes. 464 func cosImage(ctx context.Context, svc *compute.Service, arch dashboard.CosArch) (string, error) { 465 const cacheDuration = 15 * time.Minute 466 cosListMu.Lock() 467 defer cosListMu.Unlock() 468 469 cosQuery := func(a dashboard.CosArch) (string, error) { 470 imList, err := svc.Images.List("cos-cloud").Filter(fmt.Sprintf("(family eq %q)", string(arch))).Context(ctx).Do() 471 if err != nil { 472 return "", err 473 } 474 if imList.NextPageToken != "" { 475 return "", fmt.Errorf("too many images; pagination not supported") 476 } 477 ims := imList.Items 478 if len(ims) == 0 { 479 return "", errors.New("no image found") 480 } 481 sort.Slice(ims, func(i, j int) bool { 482 if ims[i].Deprecated == nil && ims[j].Deprecated != nil { 483 return true 484 } 485 return ims[i].CreationTimestamp > ims[j].CreationTimestamp 486 }) 487 return ims[0].SelfLink, nil 488 } 489 c, ok := cosCache[arch] 490 if !ok { 491 image, err := cosQuery(arch) 492 if err != nil { 493 return "", err 494 } 495 cosCache[arch] = &cosCacheEntry{ 496 cachedTime: time.Now(), 497 cachedImage: image, 498 } 499 return image, nil 500 } 501 if c.cachedImage != "" && c.cachedTime.After(time.Now().Add(-cacheDuration)) { 502 return c.cachedImage, nil 503 } 504 image, err := cosQuery(arch) 505 if err != nil { 506 return "", err 507 } 508 c.cachedImage = image 509 c.cachedTime = time.Now() 510 return image, nil 511 }