// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build linux || darwin

// Code interacting with Google Compute Engine (GCE) and
// a GCE implementation of the BuildletPool interface.

package pool

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"log"
	"net/http"
	"path"
	"sort"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"cloud.google.com/go/compute/metadata"
	"cloud.google.com/go/datastore"
	"cloud.google.com/go/errorreporting"
	"cloud.google.com/go/storage"
	"golang.org/x/build/buildenv"
	"golang.org/x/build/buildlet"
	"golang.org/x/build/dashboard"
	"golang.org/x/build/gerrit"
	"golang.org/x/build/internal/buildgo"
	"golang.org/x/build/internal/buildstats"
	"golang.org/x/build/internal/coordinator/pool/queue"
	"golang.org/x/build/internal/lru"
	"golang.org/x/build/internal/secret"
	"golang.org/x/build/internal/spanlog"
	"golang.org/x/oauth2"
	"golang.org/x/oauth2/google"
	"google.golang.org/api/compute/v1"
	"google.golang.org/api/googleapi"
)

func init() {
	buildlet.GCEGate = gceAPIGate
}

// apiCallTicker ticks regularly, preventing us from accidentally making
// GCE API calls too quickly. Our quota is 20 QPS, but we temporarily
// limit ourselves to less than that.
var apiCallTicker = time.NewTicker(time.Second / 10)

// deletionTicker is a separate rate limit for deletions, which are more
// important than other actions, especially at server startup.
var deletionTicker = time.NewTicker(time.Second / 10)

func gceAPIGate() {
	<-apiCallTicker.C
}

func deletionAPIGate() {
	<-deletionTicker.C
}

// Initialized by InitGCE:
//
// TODO(golang.org/issue/38337): These should be moved into a struct as
// part of the effort to reduce package level variables.
var (
	buildEnv *buildenv.Environment

	// dsClient is a datastore client for the build project (symbolic-datum-552), where build progress is stored.
	dsClient *datastore.Client
	// goDSClient is a datastore client for golang-org, where build status is stored.
	goDSClient *datastore.Client
	// oAuthHTTPClient is the OAuth2 HTTP client used to make API calls to Google Cloud APIs.
	oAuthHTTPClient *http.Client
	computeService  *compute.Service
	gcpCreds        *google.Credentials
	errTryDeps      error // non-nil if try bots are disabled
	gerritClient    *gerrit.Client
	storageClient   *storage.Client
	inStaging       bool                   // are we running in the staging project? (named -dev)
	errorsClient    *errorreporting.Client // Stackdriver errors client
	gkeNodeHostname string

	// Values set as a result of separating the buildlet pools into their own package.
	gceMode          string
	basePinErr       *atomic.Value
	isRemoteBuildlet IsRemoteBuildletFunc
)

// InitGCE initializes the GCE buildlet pool.
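//
// A minimal sketch of a call site (secretClient, isRemoteBuildlet, and
// the flag values are hypothetical names, not defined in this file):
//
//	var basePin atomic.Value
//	if err := pool.InitGCE(secretClient, &basePin, isRemoteBuildlet, *buildEnvName, *mode); err != nil {
//		log.Fatalf("InitGCE: %v", err)
//	}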
func InitGCE(sc *secret.Client, basePin *atomic.Value, fn IsRemoteBuildletFunc, buildEnvName, mode string) error {
	gceMode = mode
	basePinErr = basePin
	isRemoteBuildlet = fn

	ctx := context.Background()
	var err error

	// If the coordinator is running on a GCE instance and a
	// buildEnv was not specified with the env flag, set
	// buildEnvName to the project ID.
	if buildEnvName == "" {
		if mode == "dev" {
			buildEnvName = "dev"
		} else if metadata.OnGCE() {
			buildEnvName, err = metadata.ProjectID()
			if err != nil {
				log.Fatalf("metadata.ProjectID: %v", err)
			}
		}
	}

	buildEnv = buildenv.ByProjectID(buildEnvName)
	inStaging = buildEnv == buildenv.Staging

	// If running on GCE, override the zone and static IP, and check service account permissions.
	if metadata.OnGCE() {
		gkeNodeHostname, err = metadata.Get("instance/hostname")
		if err != nil {
			return fmt.Errorf("failed to get current instance hostname: %v", err)
		}

		if len(buildEnv.VMZones) == 0 || buildEnv.VMRegion == "" {
			projectZone, err := metadata.Get("instance/zone")
			if err != nil || projectZone == "" {
				return fmt.Errorf("failed to get current GCE zone: %v", err)
			}
			// Convert the zone from "projects/1234/zones/us-central1-a" to "us-central1-a".
			projectZone = path.Base(projectZone)
			if len(buildEnv.VMZones) == 0 {
				buildEnv.VMZones = []string{projectZone}
			}
			if buildEnv.VMRegion == "" {
				buildEnv.VMRegion = strings.Join(strings.Split(projectZone, "-")[:2], "-")
			}
		}

		if buildEnv.StaticIP == "" {
			buildEnv.StaticIP, err = metadata.ExternalIP()
			if err != nil {
				return fmt.Errorf("ExternalIP: %v", err)
			}
		}

		if !hasComputeScope() {
			return errors.New("coordinator is not running with access to read and write Compute resources. VM support disabled")
		}
	}

	cfgDump, _ := json.MarshalIndent(buildEnv, "", " ")
	log.Printf("Loaded configuration %q for project %q:\n%s", buildEnvName, buildEnv.ProjectName, cfgDump)

	if mode != "dev" {
		storageClient, err = storage.NewClient(ctx)
		if err != nil {
			log.Fatalf("storage.NewClient: %v", err)
		}
	}

	dsClient, err = datastore.NewClient(ctx, buildEnv.ProjectName)
	if err != nil {
		if mode == "dev" {
			log.Printf("Error creating datastore client for %q: %v", buildEnv.ProjectName, err)
		} else {
			log.Fatalf("Error creating datastore client for %q: %v", buildEnv.ProjectName, err)
		}
	}
	goDSClient, err = datastore.NewClient(ctx, buildEnv.GoProjectName)
	if err != nil {
		if mode == "dev" {
			log.Printf("Error creating datastore client for %q: %v", buildEnv.GoProjectName, err)
		} else {
			log.Fatalf("Error creating datastore client for %q: %v", buildEnv.GoProjectName, err)
		}
	}
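
	// Note: in dev mode the datastore errors above are logged but not
	// fatal, so the coordinator can still start without GCP credentials.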

	// don't send dev errors to Stackdriver.
	if mode != "dev" {
		errorsClient, err = errorreporting.NewClient(ctx, buildEnv.ProjectName, errorreporting.Config{
			ServiceName: "coordinator",
		})
		if err != nil {
			// don't exit; we still want to run the coordinator
			log.Printf("Error creating errors client: %v", err)
		}
	}

	gcpCreds, err = buildEnv.Credentials(ctx)
	if err != nil {
		if mode == "dev" {
			// don't try to do anything else with GCE, as it will likely fail
			return nil
		}
		log.Fatalf("failed to get a token source: %v", err)
	}
	oAuthHTTPClient = oauth2.NewClient(ctx, gcpCreds.TokenSource)
	computeService, _ = compute.New(oAuthHTTPClient)
	errTryDeps = checkTryBuildDeps(ctx, sc)
	if errTryDeps != nil {
		log.Printf("TryBot builders disabled due to error: %v", errTryDeps)
	} else {
		log.Printf("TryBot builders enabled.")
	}

	if mode != "dev" && metadata.OnGCE() && (buildEnv == buildenv.Production || buildEnv == buildenv.Staging) {
		go syncBuildStatsLoop(buildEnv)
		go gcePool.pollQuotaLoop()
		go createBasepinDisks(ctx)
	}

	return nil
}

// StorageClient retrieves the GCE storage client.
func StorageClient(ctx context.Context) (*storage.Client, error) {
	sc, err := storage.NewClient(ctx)
	if err != nil {
		return nil, fmt.Errorf("storage.NewClient: %w", err)
	}
	return sc, nil
}

// TODO(golang.org/issue/38337): These should be moved into a struct as
// part of the effort to reduce package level variables.

// GCEConfiguration manages and contains all of the GCE configuration.
type GCEConfiguration struct{}

// NewGCEConfiguration creates a new GCEConfiguration.
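// The returned value is stateless; its accessors read the package-level
// state initialized by InitGCE. A hypothetical caller (sketch):
//
//	env := pool.NewGCEConfiguration().BuildEnv()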
func NewGCEConfiguration() *GCEConfiguration { return &GCEConfiguration{} }

// StorageClient retrieves the GCE storage client.
func (c *GCEConfiguration) StorageClient() *storage.Client {
	return storageClient
}

// BuildEnv retrieves the GCE build env.
func (c *GCEConfiguration) BuildEnv() *buildenv.Environment {
	return buildEnv
}

// SetBuildEnv sets the GCE build env. This is primarily reserved for
// testing purposes.
func (c *GCEConfiguration) SetBuildEnv(b *buildenv.Environment) {
	buildEnv = b
}

// BuildletPool retrieves the GCE buildlet pool.
func (c *GCEConfiguration) BuildletPool() *GCEBuildlet {
	return gcePool
}

// InStaging returns a boolean denoting if the environment is staging.
func (c *GCEConfiguration) InStaging() bool {
	return inStaging
}

// GerritClient retrieves a gerrit client.
func (c *GCEConfiguration) GerritClient() *gerrit.Client {
	return gerritClient
}

// GKENodeHostname retrieves the GKE node hostname.
func (c *GCEConfiguration) GKENodeHostname() string {
	return gkeNodeHostname
}

// DSClient retrieves the datastore client.
func (c *GCEConfiguration) DSClient() *datastore.Client {
	return dsClient
}

// GoDSClient retrieves the datastore client for golang.org project.
func (c *GCEConfiguration) GoDSClient() *datastore.Client {
	return goDSClient
}

// TryDepsErr retrieves any TryBot dependency error.
func (c *GCEConfiguration) TryDepsErr() error {
	return errTryDeps
}

// ErrorsClient retrieves the Stackdriver errors client.
func (c *GCEConfiguration) ErrorsClient() *errorreporting.Client {
	return errorsClient
}

// OAuthHTTPClient retrieves an OAuth2 HTTP client used to make API calls to GCP.
func (c *GCEConfiguration) OAuthHTTPClient() *http.Client {
	return oAuthHTTPClient
}

// GCPCredentials retrieves the GCP credentials.
func (c *GCEConfiguration) GCPCredentials() *google.Credentials {
	return gcpCreds
}

func checkTryBuildDeps(ctx context.Context, sc *secret.Client) error {
	if !hasStorageScope() {
		return errors.New("coordinator's GCE instance lacks the storage service scope")
	}
	if gceMode == "dev" {
		return errors.New("running in dev mode")
	}
	wr := storageClient.Bucket(buildEnv.LogBucket).Object("hello.txt").NewWriter(context.Background())
	fmt.Fprintf(wr, "Hello, world! Coordinator start-up at %v", time.Now())
	if err := wr.Close(); err != nil {
		return fmt.Errorf("test write of a GCS object to bucket %q failed: %v", buildEnv.LogBucket, err)
	}
	if inStaging {
		// Don't expect to write to Gerrit in staging mode.
		gerritClient = gerrit.NewClient("https://go-review.googlesource.com", gerrit.NoAuth)
	} else {
		ctxSec, cancel := context.WithTimeout(ctx, 10*time.Second)
		defer cancel()

		gobotPass, err := sc.Retrieve(ctxSec, secret.NameGobotPassword)
		if err != nil {
			return fmt.Errorf("failed to get project metadata 'gobot-password': %v", err)
		}
		gerritClient = gerrit.NewClient("https://go-review.googlesource.com",
			gerrit.BasicAuth("git-gobot.golang.org", strings.TrimSpace(string(gobotPass))))
	}

	return nil
}

var gcePool = &GCEBuildlet{
	c2cpuQueue:  queue.NewQuota(),
	cpuQueue:    queue.NewQuota(),
	instQueue:   queue.NewQuota(),
	n2cpuQueue:  queue.NewQuota(),
	n2dcpuQueue: queue.NewQuota(),
	t2acpuQueue: queue.NewQuota(),
}

var _ Buildlet = (*GCEBuildlet)(nil)

// GCEBuildlet manages a pool of GCE buildlets.
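//
// Quota accounting is dead-reckoned: the queues below are charged when a
// VM is created and credited when it is destroyed, pollQuota periodically
// refreshes the regional limits, and updateUntrackedQuota accounts for
// CPUs used by non-buildlet VMs in the same project.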
type GCEBuildlet struct {
	mu sync.Mutex // guards all following

	disabled bool

	// CPU quota usage & limits. pollQuota updates quotas periodically.
	// The values recorded here reflect the updates as well as our own
	// bookkeeping of instances as they are created and destroyed.
	c2cpuQueue  *queue.Quota
	cpuQueue    *queue.Quota
	instQueue   *queue.Quota
	n2cpuQueue  *queue.Quota
	n2dcpuQueue *queue.Quota
	t2acpuQueue *queue.Quota
	inst        map[string]time.Time // GCE VM instance name -> creationTime
}

func (p *GCEBuildlet) pollQuotaLoop() {
	for {
		p.pollQuota()
		time.Sleep(time.Minute)
	}
}

// pollQuota updates CPU usage and limits from the compute API.
func (p *GCEBuildlet) pollQuota() {
	gceAPIGate()
	reg, err := computeService.Regions.Get(buildEnv.ProjectName, buildEnv.VMRegion).Do()
	if err != nil {
		log.Printf("Failed to get quota for %s/%s: %v", buildEnv.ProjectName, buildEnv.VMRegion, err)
		return
	}

	if err := p.updateUntrackedQuota(); err != nil {
		log.Printf("Failed to update quota used by other instances: %q", err)
	}
	for _, quota := range reg.Quotas {
		switch quota.Metric {
		case "CPUS":
			p.cpuQueue.UpdateLimit(int(quota.Limit))
		case "C2_CPUS":
			p.c2cpuQueue.UpdateLimit(int(quota.Limit))
		case "N2_CPUS":
			p.n2cpuQueue.UpdateLimit(int(quota.Limit))
		case "N2D_CPUS":
			p.n2dcpuQueue.UpdateLimit(int(quota.Limit))
		case "T2A_CPUS":
			p.t2acpuQueue.UpdateLimit(int(quota.Limit))
		case "INSTANCES":
			p.instQueue.UpdateLimit(int(quota.Limit))
		}
	}
}

func (p *GCEBuildlet) QuotaStats() map[string]*queue.QuotaStats {
	return map[string]*queue.QuotaStats{
		"gce-cpu":       p.cpuQueue.ToExported(),
		"gce-c2-cpu":    p.c2cpuQueue.ToExported(),
		"gce-n2-cpu":    p.n2cpuQueue.ToExported(),
		"gce-n2d-cpu":   p.n2dcpuQueue.ToExported(),
		"gce-t2a-cpu":   p.t2acpuQueue.ToExported(),
		"gce-instances": p.instQueue.ToExported(),
	}
}

func (p *GCEBuildlet) updateUntrackedQuota() error {
	untrackedQuotas := make(map[*queue.Quota]int)
	for _, zone := range buildEnv.VMZones {
		gceAPIGate()
		err := computeService.Instances.List(buildEnv.ProjectName, zone).Pages(context.Background(), func(list *compute.InstanceList) error {
			for _, inst := range list.Items {
				if isBuildlet(inst.Name) {
					continue
				}
				untrackedQuotas[p.queueForMachineType(inst.MachineType)] += GCENumCPU(inst.MachineType)
			}
			if list.NextPageToken != "" {
				// Don't use all our quota flipping through pages.
				gceAPIGate()
			}
			return nil
		})
		if err != nil {
			return err
		}
	}
	for quota, num := range untrackedQuotas {
		quota.UpdateUntracked(num)
	}
	return nil
}

// SetEnabled marks the buildlet pool as enabled or disabled.
func (p *GCEBuildlet) SetEnabled(enabled bool) {
	p.mu.Lock()
	defer p.mu.Unlock()
	p.disabled = !enabled
}

// GetBuildlet retrieves a buildlet client for an available buildlet.
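//
// At a high level it awaits instance quota and then CPU quota for the
// host's machine type, creates the VM in a random zone from
// buildEnv.VMZones (retrying while quota is exceeded), and waits for the
// buildlet to start; on failure the reserved quota is returned and any
// created VM is deleted.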
func (p *GCEBuildlet) GetBuildlet(ctx context.Context, hostType string, lg Logger, si *queue.SchedItem) (bc buildlet.Client, err error) {
	if p.disabled {
		return nil, errors.New("pool disabled by configuration")
	}
	hconf, ok := dashboard.Hosts[hostType]
	if !ok {
		return nil, fmt.Errorf("gcepool: unknown host type %q", hostType)
	}
	qsp := lg.CreateSpan("awaiting_gce_quota")
	instItem := p.instQueue.Enqueue(1, si)
	if err := instItem.Await(ctx); err != nil {
		return nil, err
	}
	cpuItem := p.queueForMachineType(hconf.MachineType()).Enqueue(GCENumCPU(hconf.MachineType()), si)
	err = cpuItem.Await(ctx)
	qsp.Done(err)
	if err != nil {
		// return unused quota
		instItem.ReturnQuota()
		return nil, err
	}

	instName := instanceName(hostType, 7)
	instName = strings.Replace(instName, "_", "-", -1) // Issue 22905; can't use underscores in GCE VMs
	p.setInstanceUsed(instName, true)

	gceBuildletSpan := lg.CreateSpan("create_gce_buildlet", instName)
	defer func() { gceBuildletSpan.Done(err) }()

	var (
		needDelete   bool
		createSpan   = lg.CreateSpan("create_gce_instance", instName)
		waitBuildlet spanlog.Span // made after create is done
		curSpan      = createSpan // either createSpan or waitBuildlet
	)

	zone := buildEnv.RandomVMZone()
	cleanup := func() {
		if needDelete {
			deleteVM(zone, instName)
		}
		instItem.ReturnQuota()
		cpuItem.ReturnQuota()
		p.setInstanceUsed(instName, false)
	}

	log.Printf("Creating GCE VM %q for %s at %s", instName, hostType, zone)
	attempts := 1
	for {
		bc, err = buildlet.StartNewVM(gcpCreds, buildEnv, instName, hostType, buildlet.VMOpts{
			DeleteIn: determineDeleteTimeout(hconf),
			OnInstanceRequested: func() {
				log.Printf("GCE VM %q now booting", instName)
			},
			OnInstanceCreated: func() {
				needDelete = true

				createSpan.Done(nil)
				waitBuildlet = lg.CreateSpan("wait_buildlet_start", instName)
				curSpan = waitBuildlet
			},
			OnGotInstanceInfo: func(*compute.Instance) {
				lg.LogEventTime("got_instance_info", "waiting_for_buildlet...")
			},
			Zone:       zone,
			DiskSizeGB: hconf.RootDriveSizeGB,
		})
		if errors.Is(err, buildlet.ErrQuotaExceeded) && ctx.Err() == nil {
			log.Printf("Failed to create VM because quota was exceeded. Retrying after 10 seconds (attempt: %d).", attempts)
			attempts++
			time.Sleep(10 * time.Second)
			continue
		} else if err != nil {
			curSpan.Done(err)
			log.Printf("Failed to create VM for %s at %s: %v", hostType, zone, err)
			cleanup()
			return nil, err
		}
		break
	}
	waitBuildlet.Done(nil)
	bc.SetDescription("GCE VM: " + instName)
	bc.SetInstanceName(instName)
	bc.SetOnHeartbeatFailure(cleanup)
	return bc, nil
}

// WriteHTMLStatus writes the status of the buildlet pool to an io.Writer.
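// Only the first and last few active instances are listed individually;
// the rest are elided with a count.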
func (p *GCEBuildlet) WriteHTMLStatus(w io.Writer) {
	fmt.Fprintf(w, "<b>GCE pool</b> capacity: %s", p.capacityString())
	const show = 6 // must be even
	active := p.instancesActive()
	if len(active) > 0 {
		fmt.Fprintf(w, "<ul>")
		for i, inst := range active {
			if i < show/2 || i >= len(active)-(show/2) {
				fmt.Fprintf(w, "<li>%v, %s</li>\n", inst.Name, friendlyDuration(time.Since(inst.Creation)))
			} else if i == show/2 {
				fmt.Fprintf(w, "<li>... %d of %d total omitted ...</li>\n", len(active)-show, len(active))
			}
		}
		fmt.Fprintf(w, "</ul>")
	}
}

func (p *GCEBuildlet) String() string {
	return fmt.Sprintf("GCE pool capacity: %s", p.capacityString())
}

func (p *GCEBuildlet) capacityString() string {
	cpuUsage := p.cpuQueue.Quotas()
	c2Usage := p.c2cpuQueue.Quotas()
	instUsage := p.instQueue.Quotas()
	n2Usage := p.n2cpuQueue.Quotas()
	n2dUsage := p.n2dcpuQueue.Quotas()
	t2aUsage := p.t2acpuQueue.Quotas()
	return fmt.Sprintf("%d/%d instances; %d/%d CPUs, %d/%d C2_CPUS, %d/%d N2_CPUS, %d/%d N2D_CPUS, %d/%d T2A_CPUS",
		instUsage.Used, instUsage.Limit,
		cpuUsage.Used, cpuUsage.Limit,
		c2Usage.Used, c2Usage.Limit,
		n2Usage.Used, n2Usage.Limit,
		n2dUsage.Used, n2dUsage.Limit,
		t2aUsage.Used, t2aUsage.Limit)
}

func (p *GCEBuildlet) queueForMachineType(mt string) *queue.Quota {
	if strings.HasPrefix(mt, "n2-") {
		return p.n2cpuQueue
	} else if strings.HasPrefix(mt, "n2d-") {
		return p.n2dcpuQueue
	} else if strings.HasPrefix(mt, "c2-") {
		return p.c2cpuQueue
	} else if strings.HasPrefix(mt, "t2a-") {
		return p.t2acpuQueue
	} else {
		// E2 and N1 instances are counted here. We do not use M1, M2,
		// or A2 quotas. See
		// https://cloud.google.com/compute/quotas#cpu_quota.
		return p.cpuQueue
	}
}

// returnQuota adjusts the dead-reckoning of our quota usage by one
// instance and by the machine type's number of CPUs.
func (p *GCEBuildlet) returnQuota(hconf *dashboard.HostConfig) {
	machineType := hconf.MachineType()
	p.queueForMachineType(machineType).ReturnQuota(GCENumCPU(machineType))
	p.instQueue.ReturnQuota(1)
}

func (p *GCEBuildlet) setInstanceUsed(instName string, used bool) {
	p.mu.Lock()
	defer p.mu.Unlock()
	if p.inst == nil {
		p.inst = make(map[string]time.Time)
	}
	if used {
		p.inst[instName] = time.Now()
	} else {
		delete(p.inst, instName)
	}
}

func (p *GCEBuildlet) instanceUsed(instName string) bool {
	p.mu.Lock()
	defer p.mu.Unlock()
	_, ok := p.inst[instName]
	return ok
}

func (p *GCEBuildlet) instancesActive() (ret []ResourceTime) {
	p.mu.Lock()
	defer p.mu.Unlock()
	for name, create := range p.inst {
		ret = append(ret, ResourceTime{
			Name:     name,
			Creation: create,
		})
	}
	sort.Sort(ByCreationTime(ret))
	return ret
}

// ResourceTime is a GCE instance or Kube pod name and its creation time.
type ResourceTime struct {
	Name     string
	Creation time.Time
}

// ByCreationTime provides the functionality to sort resource times by
// the time of creation.
type ByCreationTime []ResourceTime

func (s ByCreationTime) Len() int           { return len(s) }
func (s ByCreationTime) Less(i, j int) bool { return s[i].Creation.Before(s[j].Creation) }
func (s ByCreationTime) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }

// CleanUpOldVMs loops forever and periodically enumerates virtual
// machines and deletes those which have expired.
//
// A VM is considered expired if it has a "delete-at" metadata
// attribute having a Unix timestamp before the current time.
//
// This is the safety mechanism to delete VMs which stray from the
// normal deletion process. VMs are created to run a single build and
// should be shut down by a controlling process. Due to various types
// of failures, they might get stranded. To prevent stranded VMs from
// wasting resources forever, the "delete-at" metadata attribute is set
// at creation time to a point well beyond the VM's expected lifetime.
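//
// Conceptually the deadline is computed like this sketch (the actual
// instance-creation code lives in the buildlet package; timeout is a
// hypothetical name):
//
//	deleteAt := strconv.FormatInt(time.Now().Add(timeout).Unix(), 10)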
func (p *GCEBuildlet) CleanUpOldVMs() {
	if gceMode == "dev" {
		return
	}
	if computeService == nil {
		return
	}

	// TODO(bradfitz): remove this list and just query it from the compute API?
	// https://godoc.org/google.golang.org/api/compute/v1#RegionsService.Get
	// and Region.Zones: https://godoc.org/google.golang.org/api/compute/v1#Region

	for {
		for _, zone := range buildEnv.VMZones {
			if err := p.cleanZoneVMs(zone); err != nil {
				log.Printf("Error cleaning VMs in zone %q: %v", zone, err)
			}
		}
		time.Sleep(time.Minute)
	}
}

// cleanZoneVMs is part of CleanUpOldVMs, operating on a single zone.
func (p *GCEBuildlet) cleanZoneVMs(zone string) error {
	deletionAPIGate()
	err := computeService.Instances.List(buildEnv.ProjectName, zone).Pages(context.Background(), func(list *compute.InstanceList) error {
		for _, inst := range list.Items {
			if inst.Metadata == nil {
				// Defensive. Not seen in practice.
				continue
			}
			if isRemoteBuildlet(inst.Name) {
				// Remote buildlets have their own expiration mechanism that respects active SSH sessions.
				log.Printf("cleanZoneVMs: skipping remote buildlet %q", inst.Name)
				continue
			}
			var sawDeleteAt bool
			var deleteReason string
			for _, it := range inst.Metadata.Items {
				if it.Key == "delete-at" {
					if it.Value == nil {
						log.Printf("missing delete-at value; ignoring")
						continue
					}
					unixDeadline, err := strconv.ParseInt(*it.Value, 10, 64)
					if err != nil {
						log.Printf("invalid delete-at value %q seen; ignoring", *it.Value)
						continue
					}
					sawDeleteAt = true
					if time.Now().Unix() > unixDeadline {
						deleteReason = "delete-at expiration"
					}
				}
			}
			isBuildlet := isBuildlet(inst.Name)

			if isBuildlet && !sawDeleteAt && !p.instanceUsed(inst.Name) {
				createdAt, _ := time.Parse(time.RFC3339Nano, inst.CreationTimestamp)
				if createdAt.Before(time.Now().Add(-3 * time.Hour)) {
					deleteReason = fmt.Sprintf("no delete-at, created at %s", inst.CreationTimestamp)
				}
			}

			// Delete buildlets (things we made) from previous
			// generations. Deleting only instances named "buildlet-*"
			// is a historical restriction, but still fine for paranoia.
			if deleteReason == "" && sawDeleteAt && isBuildlet && !p.instanceUsed(inst.Name) {
				if _, ok := deletedVMCache.Get(inst.Name); !ok {
					deleteReason = "from earlier coordinator generation"
				}
			}

			if deleteReason != "" {
				log.Printf("deleting VM %q in zone %q; %s ...", inst.Name, zone, deleteReason)
				deleteVM(zone, inst.Name)
			}
		}
		if list.NextPageToken != "" {
			// Don't use all our quota flipping through pages.
			deletionAPIGate()
		}
		return nil
	})
	if err != nil {
		return fmt.Errorf("listing instances: %v", err)
	}
	return nil
}

var deletedVMCache = lru.New(100) // keyed by instName

type token struct{}

// deleteVM starts a delete of an instance in a given zone.
//
// It either returns an operation name (if the delete is pending) or the
// empty string if the instance didn't exist.
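//
// A googleapi "not found" error (HTTP 404) is treated as success, with an
// empty operation name and nil error, since the instance is already gone.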
func deleteVM(zone, instName string) (operation string, err error) {
	deletedVMCache.Add(instName, token{})
	deletionAPIGate()
	op, err := computeService.Instances.Delete(buildEnv.ProjectName, zone, instName).Do()
	if apiErr, ok := err.(*googleapi.Error); ok && apiErr.Code == 404 {
		return "", nil
	}
	if err != nil {
		log.Printf("Failed to delete instance %q in zone %q: %v", instName, zone, err)
		return "", err
	}
	log.Printf("Sent request to delete instance %q in zone %q. Operation ID, Name: %v, %v", instName, zone, op.Id, op.Name)
	return op.Name, nil
}

// HasScope reports whether the GCE instance's default service account
// has the given scope.
func HasScope(want string) bool {
	// If not on GCE, assume full access
	if !metadata.OnGCE() {
		return true
	}
	scopes, err := metadata.Scopes("default")
	if err != nil {
		log.Printf("failed to query metadata default scopes: %v", err)
		return false
	}
	for _, v := range scopes {
		if v == want {
			return true
		}
	}
	return false
}

func hasComputeScope() bool {
	return HasScope(compute.ComputeScope) || HasScope(compute.CloudPlatformScope)
}

func hasStorageScope() bool {
	return HasScope(storage.ScopeReadWrite) || HasScope(storage.ScopeFullControl) || HasScope(compute.CloudPlatformScope)
}

// syncBuildStatsLoop runs forever in its own goroutine and syncs the
// coordinator's datastore Build & Span entities to BigQuery
// periodically.
func syncBuildStatsLoop(env *buildenv.Environment) {
	ticker := time.NewTicker(5 * time.Minute)
	for {
		ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
		if err := buildstats.SyncBuilds(ctx, env); err != nil {
			log.Printf("buildstats: SyncBuilds: %v", err)
		}
		if err := buildstats.SyncSpans(ctx, env); err != nil {
			log.Printf("buildstats: SyncSpans: %v", err)
		}
		cancel()
		<-ticker.C
	}
}

// createBasepinDisks creates zone-local copies of VM disk images, to
// speed up VM creations in the future.
//
// Other than a list call, this is a no-op unless new VM images were
// added or updated recently.
func createBasepinDisks(ctx context.Context) {
	for {
		t0 := time.Now()
		bgc, err := buildgo.NewClient(ctx, buildEnv)
		if err != nil {
			log.Printf("basepin: NewClient: %v", err)
			return
		}
		log.Printf("basepin: creating basepin disks...")
		err = bgc.MakeBasepinDisks(ctx)
		d := time.Since(t0).Round(time.Second / 10)
		if err != nil {
			basePinErr.Store(err.Error())
			log.Printf("basepin: error creating basepin disks, after %v: %v", d, err)
			time.Sleep(5 * time.Minute)
			continue
		}
		basePinErr.Store("")
		log.Printf("basepin: created basepin disks after %v", d)
		return
	}
}

// GCENumCPU returns the number of GCE CPUs used by the specified machine type.
func GCENumCPU(machineType string) int {
	if strings.HasSuffix(machineType, "e2-medium") || strings.HasSuffix(machineType, "e2-small") || strings.HasSuffix(machineType, "e2-micro") {
		return 2
	}
	n, _ := strconv.Atoi(machineType[strings.LastIndex(machineType, "-")+1:])
	return n
}
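
// For example, GCENumCPU("n2-standard-8") returns 8, parsed from the
// numeric suffix, while the fixed-shape e2-micro, e2-small, and
// e2-medium machine types all report 2 CPUs.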