golang.org/x/build@v0.0.0-20240506185731-218518f32b70/cmd/makemac/main.go (about) 1 // Copyright 2023 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Command makemac manages MacService instances for LUCI. 6 // 7 // It performs several different operations: 8 // 9 // * Detects MacService leases that MacService thinks are running, but never 10 // connected to LUCI (failed to boot?) and destroys them. 11 // * Detects MacService leases that MacService thinks are running, but LUCI 12 // thinks are dead (froze/crashed?) and destoys them. 13 // * Renews MacService leases that both MacService and LUCI agree are healthy 14 // to ensure they don't expire. 15 // * Destroys MacService leases with images that are not requested by the 16 // configuration in config.go. 17 // * Launches new MacService leases to ensure that there are the at least as 18 // many leases of each type as specified in the configuration in config.go. 19 package main 20 21 import ( 22 "context" 23 "flag" 24 "fmt" 25 "log" 26 "regexp" 27 "sort" 28 "strings" 29 "time" 30 31 "go.chromium.org/luci/swarming/client/swarming" 32 spb "go.chromium.org/luci/swarming/proto/api_v2" 33 "golang.org/x/build/internal/macservice" 34 "golang.org/x/build/internal/secret" 35 "golang.org/x/oauth2/google" 36 ) 37 38 var ( 39 apiKey = secret.Flag("macservice-api-key", "MacService API key") 40 period = flag.Duration("period", 1*time.Hour, "How often to check bots and leases. As a special case, -period=0 checks exactly once and then exits") 41 dryRun = flag.Bool("dry-run", false, "Print the actions that would be taken without actually performing them") 42 ) 43 44 const ( 45 createExpirationDuration = 24*time.Hour 46 createExpirationDurationString = "86400s" 47 48 // Shorter renew expiration is a workaround to detect newly-created 49 // leases. See comment in handleMissingBots. 50 renewExpirationDuration = 23*time.Hour 51 renewExpirationDurationString = "82800s" // 23h 52 ) 53 54 const ( 55 macServiceCustomer = "golang" 56 57 // Leases managed by makemac have ProjectName "makemac/SWARMING_HOST", 58 // indicating that it is managed by makemac, and which swarming host it 59 // belongs to. Leases without this project prefix will not be touched. 60 // 61 // Note that we track the swarming host directly in the lease project 62 // name because new leases may not have yet connected to the swarming 63 // server, but we still need to know which host to count them towards. 64 managedProjectPrefix = "makemac" 65 ) 66 67 func main() { 68 secret.InitFlagSupport(context.Background()) 69 flag.Parse() 70 71 if err := run(); err != nil { 72 log.Fatal(err) 73 } 74 } 75 76 func run() error { 77 ctx := context.Background() 78 79 var mc macServiceClient 80 mc = macservice.NewClient(*apiKey) 81 if *dryRun { 82 mc = readOnlyMacServiceClient{mc: mc} 83 } 84 85 // Use service account / application default credentials for swarming 86 // authentication. 87 ac, err := google.DefaultClient(ctx) 88 if err != nil { 89 return fmt.Errorf("error creating authenticated client: %w", err) 90 } 91 92 // Initialize each swarming client. 93 for sc, ic := range prodImageConfig { 94 c, err := swarming.NewClient(ctx, swarming.ClientOptions{ 95 ServiceURL: "https://"+sc.Host, 96 AuthenticatedClient: ac, 97 }) 98 if err != nil { 99 return fmt.Errorf("error creating swarming client for %s: %w", sc.Host, err) 100 } 101 sc.client = c 102 103 logImageConfig(sc, ic) 104 } 105 106 // Always run once at startup. 107 runOnce(ctx, prodImageConfig, mc) 108 109 if *period == 0 { 110 // User only wants a single check. We're done. 111 return nil 112 } 113 114 t := time.NewTicker(*period) 115 for range t.C { 116 runOnce(ctx, prodImageConfig, mc) 117 } 118 119 return nil 120 } 121 122 func runOnce(ctx context.Context, config map[*swarmingConfig][]imageConfig, mc macServiceClient) { 123 bots, err := swarmingBots(ctx, config) 124 if err != nil { 125 log.Printf("Error looking up swarming bots: %v", err) 126 return 127 } 128 129 leases, err := macServiceLeases(mc) 130 if err != nil { 131 log.Printf("Error looking up MacService leases: %v", err) 132 return 133 } 134 135 logSummary(bots, leases) 136 137 // These directly correspond to the operation described in the package 138 // comment above. 139 handleMissingBots(mc, bots, leases) 140 handleDeadBots(mc, bots, leases) 141 renewLeases(mc, leases) 142 handleObsoleteLeases(mc, config, leases) 143 addNewLeases(mc, config, leases) 144 } 145 146 // leaseSwarmingHost returns the swarming host a managed lease belongs to. 147 // 148 // Returns "" if this isn't a managed lease. 149 func leaseSwarmingHost(l macservice.Lease) string { 150 prefix, host, ok := strings.Cut(l.VMResourceNamespace.ProjectName, "/") 151 if !ok { 152 // Malformed project name, must not be managed. 153 return "" 154 } 155 if prefix != managedProjectPrefix { 156 // Some other prefix. Not managed. 157 return "" 158 } 159 return host 160 } 161 162 func leaseIsManaged(l macservice.Lease) bool { 163 return leaseSwarmingHost(l) != "" 164 } 165 166 func logSummary(bots map[string]*spb.BotInfo, leases map[string]macservice.Instance) { 167 keys := make([]string, 0, len(bots)) 168 for k := range bots { 169 keys = append(keys, k) 170 } 171 sort.Strings(keys) 172 log.Printf("Swarming bots:") 173 for _, k := range keys { 174 b := bots[k] 175 176 alive := true 177 if b.GetIsDead() { 178 alive = false 179 } 180 181 os := "<unknown OS version>" 182 dimensions := b.GetDimensions() 183 for _, d := range dimensions { 184 if d.Key != "os" { 185 continue 186 } 187 if len(d.Value) == 0 { 188 continue 189 } 190 os = d.Value[len(d.Value)-1] // most specific value last. 191 } 192 193 log.Printf("\t%s: alive=%t\tos=%s", k, alive, os) 194 } 195 196 keys = make([]string, 0, len(leases)) 197 for k := range leases { 198 keys = append(keys, k) 199 } 200 sort.Strings(keys) 201 log.Printf("MacService leases:") 202 for _, k := range keys { 203 inst := leases[k] 204 205 swarming := leaseSwarmingHost(inst.Lease) 206 if swarming == "" { 207 swarming = "<unmanaged>" 208 } 209 210 image := inst.InstanceSpecification.DiskSelection.ImageHashes.BootSHA256 211 212 log.Printf("\t%s: image=%s\tswarming=%s", k, image, swarming) 213 } 214 } 215 216 // e.g., darwin-amd64-11--39b47cf6-2aaa-4c80-b9cb-b800844fb104.golang.c3.macservice.goog 217 var botIDRe = regexp.MustCompile(`.*--([0-9a-f-]+)\.golang\..*\.macservice.goog$`) 218 219 // swarmingBots returns set of bots backed by MacService, as seen by swarming. 220 // The map key is the MacService lease ID. 221 // Bots may be dead. 222 func swarmingBots(ctx context.Context, config map[*swarmingConfig][]imageConfig) (map[string]*spb.BotInfo, error) { 223 m := make(map[string]*spb.BotInfo) 224 225 scs := sortedSwarmingConfigs(config) 226 for _, sc := range scs { 227 dimensions := []*spb.StringPair{ 228 { 229 Key: "pool", 230 Value: sc.Pool, 231 }, 232 { 233 Key: "os", 234 Value: "Mac", 235 }, 236 } 237 bb, err := sc.client.ListBots(ctx, dimensions) 238 if err != nil { 239 return nil, fmt.Errorf("error listing bots: %w", err) 240 } 241 242 for _, b := range bb { 243 id := b.GetBotId() 244 match := botIDRe.FindStringSubmatch(id) 245 if match == nil { 246 log.Printf("Swarming bot %s is not a MacService bot, skipping...", id) 247 continue 248 } 249 250 lease := match[1] 251 m[lease] = b 252 } 253 } 254 255 return m, nil 256 } 257 258 // macServiceLeases returns the set of active MacService leases. 259 func macServiceLeases(mc macServiceClient) (map[string]macservice.Instance, error) { 260 resp, err := mc.Find(macservice.FindRequest{ 261 VMResourceNamespace: macservice.Namespace{ 262 CustomerName: "golang", 263 }, 264 }) 265 if err != nil { 266 return nil, fmt.Errorf("error finding leases: %v", err) 267 } 268 269 m := make(map[string]macservice.Instance) 270 271 for _, i := range resp.Instances { 272 m[i.Lease.LeaseID] = i 273 } 274 275 return m, nil 276 } 277 278 // handleMissingBots detects MacService leases that MacService thinks are 279 // running, but never connected to LUCI (i.e., missing completely from LUCI) 280 // and destroys them. 281 // 282 // These are bots that perhaps never successfully booted? 283 func handleMissingBots(mc macServiceClient, bots map[string]*spb.BotInfo, leases map[string]macservice.Instance) { 284 log.Printf("Checking for missing bots...") 285 286 var missing []string 287 for id := range leases { 288 if _, ok := bots[id]; !ok { 289 missing = append(missing, id) 290 } 291 } 292 // Sort to make the logs easier to follow when comparing vs a bot/lease 293 // list. 294 sort.Strings(missing) 295 296 for _, id := range missing { 297 lease := leases[id] 298 299 if !leaseIsManaged(lease.Lease) { 300 log.Printf("Lease %s missing from LUCI, but not managed by makemac; skipping", id) 301 continue 302 } 303 304 // There is a race window here: if this lease was created in 305 // the last few minutes, the initial boot may still be ongoing, 306 // and thus being missing from LUCI is expected. We don't want 307 // to destroy these leases. 308 // 309 // Unfortunately MacService doesn't report lease creation time, 310 // so we can't trivially check for this case. It does report 311 // expiration time. As a workaround, we create new leases with 312 // a 24h expiration time, but renew leases with a 23h 313 // expiration. Thus if we see expiration is >23h from now then 314 // this lease must have been created in the last hour. 315 untilExpiration := time.Until(lease.Lease.Expires) 316 if untilExpiration > renewExpirationDuration { 317 log.Printf("Lease %s missing from LUCI, but created in the last hour (still booting?); skipping", id) 318 continue 319 } 320 321 log.Printf("Lease %s missing from LUCI; failed initial boot?", id) 322 log.Printf("Vacating lease %s...", id) 323 if err := mc.Vacate(macservice.VacateRequest{LeaseID: id}); err != nil { 324 log.Printf("Error vacating lease %s: %v", id, err) 325 continue 326 } 327 delete(leases, id) // Drop from map so future calls know it is gone. 328 } 329 } 330 331 // handleDeadBots detects MacService leases that MacService thinks are running, 332 // but LUCI thinks are dead (froze/crashed?) and destoys them. 333 // 334 // These are bots that perhaps froze/crashed at some point after starting. 335 func handleDeadBots(mc macServiceClient, bots map[string]*spb.BotInfo, leases map[string]macservice.Instance) { 336 log.Printf("Checking for dead bots...") 337 338 var dead []string 339 for id, b := range bots { 340 if b.GetIsDead() { 341 dead = append(dead, id) 342 } 343 } 344 // Sort to make the logs easier to follow when comparing vs a bot/lease 345 // list. 346 sort.Strings(dead) 347 348 for _, id := range dead { 349 lease, ok := leases[id] 350 if !ok { 351 // Dead bot already gone from MacService; nothing to do. 352 continue 353 } 354 355 if !leaseIsManaged(lease.Lease) { 356 log.Printf("Lease %s is dead on LUCI, but still present on MacService, but not managed by makemac; skipping", id) 357 continue 358 } 359 360 // No need to check for newly created leases like we do in 361 // handleMissingBots. If a bot appears as dead on LUCI then it 362 // must have successfully connected at some point. 363 364 log.Printf("Lease %s is dead on LUCI, but still present on MacService; VM froze/crashed?", id) 365 log.Printf("Vacating lease %s...", id) 366 if err := mc.Vacate(macservice.VacateRequest{LeaseID: id}); err != nil { 367 log.Printf("Error vacating lease %s: %v", id, err) 368 continue 369 } 370 delete(leases, id) // Drop from map so future calls know it is gone. 371 } 372 } 373 374 // renewLeases renews lease expiration on all makemac-managed leases. Note that 375 // this may renew leases that will later be removed because their image is no 376 // longer required. This is harmless. 377 func renewLeases(mc macServiceClient, leases map[string]macservice.Instance) { 378 log.Printf("Renewing leases...") 379 380 var ids []string 381 for id := range leases { 382 ids = append(ids, id) 383 } 384 // Sort to make the logs easier to follow when comparing vs a bot/lease 385 // list. 386 sort.Strings(ids) 387 388 for _, id := range ids { 389 lease := leases[id] 390 391 if !leaseIsManaged(lease.Lease) { 392 log.Printf("Lease %s is not managed by makemac; skipping renew", id) 393 continue 394 } 395 396 // Extra spaces to make expiration line up with the renewal message below. 397 log.Printf("Lease ID: %s currently expires: %v", lease.Lease.LeaseID, lease.Lease.Expires) 398 399 // Newly created leases have a longer expiration duration than 400 // our renewal expiration duration. Don't renew these, which 401 // would would unintentionally shorten their expiration. See 402 // comment in handleMissingBots. 403 until := time.Until(lease.Lease.Expires) 404 if until > renewExpirationDuration { 405 log.Printf("Lease ID: %s skip renew, current expiration further out than renew expiration", lease.Lease.LeaseID) 406 continue 407 } 408 409 rr, err := mc.Renew(macservice.RenewRequest{ 410 LeaseID: lease.Lease.LeaseID, 411 Duration: renewExpirationDurationString, 412 }) 413 if err == nil { 414 log.Printf("Lease ID: %s renewed, now expires: %v", lease.Lease.LeaseID, rr.Expires) 415 } else { 416 log.Printf("Lease ID: %s error renewing %v", lease.Lease.LeaseID, err) 417 } 418 } 419 } 420 421 // handleObsoleteLeases vacates any makemac-managed leases with images that are 422 // not requested by imageConfigs. This typically occurs when updating makemac 423 // to roll out a new image version. 424 func handleObsoleteLeases(mc macServiceClient, config map[*swarmingConfig][]imageConfig, leases map[string]macservice.Instance) { 425 log.Printf("Checking for leases with obsolete images...") 426 427 // swarming host -> image sha -> image config 428 swarmingImages := make(map[string]map[string]*imageConfig) 429 for sc, ic := range config { 430 swarmingImages[sc.Host] = imageConfigMap(ic) 431 } 432 433 var ids []string 434 for id := range leases { 435 ids = append(ids, id) 436 } 437 // Sort to make the logs easier to follow when comparing vs a bot/lease 438 // list. 439 sort.Strings(ids) 440 441 for _, id := range ids { 442 lease := leases[id] 443 444 swarming := leaseSwarmingHost(lease.Lease) 445 if swarming == "" { 446 log.Printf("Lease %s is not managed by makemac; skipping image check", id) 447 continue 448 } 449 450 images, ok := swarmingImages[swarming] 451 if !ok { 452 log.Printf("Lease %s belongs to unknown swarming host %s; skipping image check", id, swarming) 453 continue 454 } 455 456 image := lease.InstanceSpecification.DiskSelection.ImageHashes.BootSHA256 457 if _, ok := images[image]; ok { 458 continue 459 } 460 461 // Config doesn't want instances with this image. Vacate. 462 log.Printf("Lease %s uses obsolete image %s", id, image) 463 log.Printf("Vacating lease %s...", id) 464 if err := mc.Vacate(macservice.VacateRequest{LeaseID: id}); err != nil { 465 log.Printf("Error vacating lease %s: %v", id, err) 466 continue 467 } 468 delete(leases, id) // Drop from map so future calls know it is gone. 469 } 470 } 471 472 func makeLeaseRequest(sc *swarmingConfig, ic *imageConfig) (macservice.LeaseRequest, error) { 473 cert, err := secret.DefaultResolver.ResolveSecret(ic.Cert) 474 if err != nil { 475 return macservice.LeaseRequest{}, fmt.Errorf("error resolving certificate secret: %w", err) 476 } 477 key, err := secret.DefaultResolver.ResolveSecret(ic.Key) 478 if err != nil { 479 return macservice.LeaseRequest{}, fmt.Errorf("error resolving key secret: %w", err) 480 } 481 482 return macservice.LeaseRequest{ 483 VMResourceNamespace: macservice.Namespace{ 484 CustomerName: macServiceCustomer, 485 ProjectName: managedProjectPrefix+"/"+sc.Host, 486 }, 487 InstanceSpecification: macservice.InstanceSpecification{ 488 Profile: macservice.V1_MEDIUM_VM, 489 AccessLevel: macservice.GOLANG_OSS, 490 DiskSelection: macservice.DiskSelection{ 491 ImageHashes: macservice.ImageHashes{ 492 BootSHA256: ic.Image, 493 }, 494 }, 495 Metadata: []macservice.MetadataEntry{ 496 { 497 Key: "golang.swarming", 498 Value: sc.Host, 499 }, 500 { 501 Key: "golang.hostname", 502 Value: ic.Hostname, 503 }, 504 { 505 Key: "golang.cert", 506 Value: cert, 507 }, 508 { 509 Key: "golang.key", 510 Value: key, 511 }, 512 }, 513 }, 514 Duration: createExpirationDurationString, 515 }, nil 516 } 517 518 // addNewLeases adds new MacService leases as needed to ensure that there are 519 // at least MinCount makemac-managed leases of each configured image type. 520 func addNewLeases(mc macServiceClient, config map[*swarmingConfig][]imageConfig, leases map[string]macservice.Instance) { 521 log.Printf("Checking if new leases are required...") 522 523 // Count images per swarming host. Each host gets a different 524 // configuration. Map of swarming host -> image sha -> count. 525 swarmingImageCount := make(map[string]map[string]int) 526 for _, lease := range leases { 527 swarming := leaseSwarmingHost(lease.Lease) 528 if swarming == "" { 529 // Don't count leases we don't manage. 530 continue 531 } 532 if _, ok := swarmingImageCount[swarming]; !ok { 533 swarmingImageCount[swarming] = make(map[string]int) 534 } 535 536 image := lease.InstanceSpecification.DiskSelection.ImageHashes.BootSHA256 537 swarmingImageCount[swarming][image]++ 538 } 539 540 // Iterate through configs in swarming order, then image order. 541 swarmingOrder := sortedSwarmingConfigs(config) 542 imageMap := make([]map[string]*imageConfig, 0, len(swarmingOrder)) 543 imageOrder := make([][]string, 0, len(swarmingOrder)) 544 for _, sc := range swarmingOrder { 545 m := imageConfigMap(config[sc]) 546 order := make([]string, 0, len(m)) 547 for image := range m { 548 order = append(order, image) 549 } 550 sort.Strings(order) 551 imageMap = append(imageMap, m) 552 imageOrder = append(imageOrder, order) 553 } 554 555 log.Printf("Current image lease count:") 556 for i, sc := range swarmingOrder { 557 for _, image := range imageOrder[i] { 558 config := imageMap[i][image] 559 gotCount := swarmingImageCount[sc.Host][config.Image] 560 log.Printf("\tHost %s: image %s: have %d leases\twant %d leases", sc.Host, config.Image, gotCount, config.MinCount) 561 } 562 } 563 564 for i, sc := range swarmingOrder { 565 for _, image := range imageOrder[i] { 566 config := imageMap[i][image] 567 gotCount := swarmingImageCount[sc.Host][config.Image] 568 need := config.MinCount - gotCount 569 if need <= 0 { 570 continue 571 } 572 573 log.Printf("Host %s: image %s: creating %d new leases", sc.Host, config.Image, need) 574 req, err := makeLeaseRequest(sc, config) 575 if err != nil { 576 log.Printf("Host %s: image %s: creating lease request: error %v", sc.Host, config.Image, err) 577 continue 578 } 579 580 for i := 0; i < need; i++ { 581 log.Printf("Host %s: image %s: creating lease %d...", sc.Host, config.Image, i) 582 resp, err := mc.Lease(req) 583 if err != nil { 584 log.Printf("Host %s: image %s: creating lease %d: error %v", sc.Host, config.Image, i, err) 585 continue 586 } 587 log.Printf("Host %s: image %s: created lease %s", sc.Host, config.Image, resp.PendingLease.LeaseID) 588 } 589 } 590 } 591 }