golang.org/x/build@v0.0.0-20240506185731-218518f32b70/cmd/makemac/main.go (about)

     1  // Copyright 2023 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Command makemac manages MacService instances for LUCI.
     6  //
     7  // It performs several different operations:
     8  //
     9  // * Detects MacService leases that MacService thinks are running, but never
    10  //   connected to LUCI (failed to boot?) and destroys them.
    11  // * Detects MacService leases that MacService thinks are running, but LUCI
    12  //   thinks are dead (froze/crashed?) and destroys them.
    13  // * Renews MacService leases that both MacService and LUCI agree are healthy
    14  //   to ensure they don't expire.
    15  // * Destroys MacService leases with images that are not requested by the
    16  //   configuration in config.go.
    17  // * Launches new MacService leases to ensure that there are at least as
    18  //   many leases of each type as specified in the configuration in config.go.
    19  package main
    20  
    21  import (
    22  	"context"
    23  	"flag"
    24  	"fmt"
    25  	"log"
    26  	"regexp"
    27  	"sort"
    28  	"strings"
    29  	"time"
    30  
    31  	"go.chromium.org/luci/swarming/client/swarming"
    32  	spb "go.chromium.org/luci/swarming/proto/api_v2"
    33  	"golang.org/x/build/internal/macservice"
    34  	"golang.org/x/build/internal/secret"
    35  	"golang.org/x/oauth2/google"
    36  )
    37  
// Command-line flags. They are parsed in main before run reads them.
var (
	// apiKey is the MacService API key; run passes it to macservice.NewClient.
	// It is declared through the secret package's flag helper rather than
	// the flag package directly.
	apiKey = secret.Flag("macservice-api-key", "MacService API key")
	// period is the interval between checks. run treats 0 as "check once,
	// then exit".
	period = flag.Duration("period", 1*time.Hour, "How often to check bots and leases. As a special case, -period=0 checks exactly once and then exits")
	// dryRun makes run wrap the MacService client in
	// readOnlyMacServiceClient.
	dryRun = flag.Bool("dry-run", false, "Print the actions that would be taken without actually performing them")
)
    43  
    44  const (
    45  	createExpirationDuration = 24*time.Hour
    46  	createExpirationDurationString = "86400s"
    47  
    48  	// Shorter renew expiration is a workaround to detect newly-created
    49  	// leases. See comment in handleMissingBots.
    50  	renewExpirationDuration = 23*time.Hour
    51  	renewExpirationDurationString = "82800s" // 23h
    52  )
    53  
const (
	// macServiceCustomer is the CustomerName placed in the
	// VMResourceNamespace of lease requests built by makeLeaseRequest.
	macServiceCustomer = "golang"

	// Leases managed by makemac have ProjectName "makemac/SWARMING_HOST",
	// indicating that it is managed by makemac, and which swarming host it
	// belongs to. Leases without this project prefix will not be touched.
	//
	// Note that we track the swarming host directly in the lease project
	// name because new leases may not have yet connected to the swarming
	// server, but we still need to know which host to count them towards.
	managedProjectPrefix = "makemac"
)
    66  
    67  func main() {
    68  	secret.InitFlagSupport(context.Background())
    69  	flag.Parse()
    70  
    71  	if err := run(); err != nil {
    72  		log.Fatal(err)
    73  	}
    74  }
    75  
    76  func run() error {
    77  	ctx := context.Background()
    78  
    79  	var mc macServiceClient
    80  	mc = macservice.NewClient(*apiKey)
    81  	if *dryRun {
    82  		mc = readOnlyMacServiceClient{mc: mc}
    83  	}
    84  
    85  	// Use service account / application default credentials for swarming
    86  	// authentication.
    87  	ac, err := google.DefaultClient(ctx)
    88  	if err != nil {
    89  		return fmt.Errorf("error creating authenticated client: %w", err)
    90  	}
    91  
    92  	// Initialize each swarming client.
    93  	for sc, ic := range prodImageConfig {
    94  		c, err := swarming.NewClient(ctx, swarming.ClientOptions{
    95  			ServiceURL:          "https://"+sc.Host,
    96  			AuthenticatedClient: ac,
    97  		})
    98  		if err != nil {
    99  			return fmt.Errorf("error creating swarming client for %s: %w", sc.Host, err)
   100  		}
   101  		sc.client = c
   102  
   103  		logImageConfig(sc, ic)
   104  	}
   105  
   106  	// Always run once at startup.
   107  	runOnce(ctx, prodImageConfig, mc)
   108  
   109  	if *period == 0 {
   110  		// User only wants a single check. We're done.
   111  		return nil
   112  	}
   113  
   114  	t := time.NewTicker(*period)
   115  	for range t.C {
   116  		runOnce(ctx, prodImageConfig, mc)
   117  	}
   118  
   119  	return nil
   120  }
   121  
   122  func runOnce(ctx context.Context, config map[*swarmingConfig][]imageConfig, mc macServiceClient) {
   123  	bots, err := swarmingBots(ctx, config)
   124  	if err != nil {
   125  		log.Printf("Error looking up swarming bots: %v", err)
   126  		return
   127  	}
   128  
   129  	leases, err := macServiceLeases(mc)
   130  	if err != nil {
   131  		log.Printf("Error looking up MacService leases: %v", err)
   132  		return
   133  	}
   134  
   135  	logSummary(bots, leases)
   136  
   137  	// These directly correspond to the operation described in the package
   138  	// comment above.
   139  	handleMissingBots(mc, bots, leases)
   140  	handleDeadBots(mc, bots, leases)
   141  	renewLeases(mc, leases)
   142  	handleObsoleteLeases(mc, config, leases)
   143  	addNewLeases(mc, config, leases)
   144  }
   145  
   146  // leaseSwarmingHost returns the swarming host a managed lease belongs to.
   147  //
   148  // Returns "" if this isn't a managed lease.
   149  func leaseSwarmingHost(l macservice.Lease) string {
   150  	prefix, host, ok := strings.Cut(l.VMResourceNamespace.ProjectName, "/")
   151  	if !ok {
   152  		// Malformed project name, must not be managed.
   153  		return ""
   154  	}
   155  	if prefix != managedProjectPrefix {
   156  		// Some other prefix. Not managed.
   157  		return ""
   158  	}
   159  	return host
   160  }
   161  
   162  func leaseIsManaged(l macservice.Lease) bool {
   163  	return leaseSwarmingHost(l) != ""
   164  }
   165  
   166  func logSummary(bots map[string]*spb.BotInfo, leases map[string]macservice.Instance) {
   167  	keys := make([]string, 0, len(bots))
   168  	for k := range bots {
   169  		keys = append(keys, k)
   170  	}
   171  	sort.Strings(keys)
   172  	log.Printf("Swarming bots:")
   173  	for _, k := range keys {
   174  		b := bots[k]
   175  
   176  		alive := true
   177  		if b.GetIsDead() {
   178  			alive = false
   179  		}
   180  
   181  		os := "<unknown OS version>"
   182  		dimensions := b.GetDimensions()
   183  		for _, d := range dimensions {
   184  			if d.Key != "os" {
   185  				continue
   186  			}
   187  			if len(d.Value) == 0 {
   188  				continue
   189  			}
   190  			os = d.Value[len(d.Value)-1] // most specific value last.
   191  		}
   192  
   193  		log.Printf("\t%s: alive=%t\tos=%s", k, alive, os)
   194  	}
   195  
   196  	keys = make([]string, 0, len(leases))
   197  	for k := range leases {
   198  		keys = append(keys, k)
   199  	}
   200  	sort.Strings(keys)
   201  	log.Printf("MacService leases:")
   202  	for _, k := range keys {
   203  		inst := leases[k]
   204  
   205  		swarming := leaseSwarmingHost(inst.Lease)
   206  		if swarming == "" {
   207  			swarming = "<unmanaged>"
   208  		}
   209  
   210  		image := inst.InstanceSpecification.DiskSelection.ImageHashes.BootSHA256
   211  
   212  		log.Printf("\t%s: image=%s\tswarming=%s", k, image, swarming)
   213  	}
   214  }
   215  
   216  // e.g., darwin-amd64-11--39b47cf6-2aaa-4c80-b9cb-b800844fb104.golang.c3.macservice.goog
   217  var botIDRe = regexp.MustCompile(`.*--([0-9a-f-]+)\.golang\..*\.macservice.goog$`)
   218  
   219  // swarmingBots returns set of bots backed by MacService, as seen by swarming.
   220  // The map key is the MacService lease ID.
   221  // Bots may be dead.
   222  func swarmingBots(ctx context.Context, config map[*swarmingConfig][]imageConfig) (map[string]*spb.BotInfo, error) {
   223  	m := make(map[string]*spb.BotInfo)
   224  
   225  	scs := sortedSwarmingConfigs(config)
   226  	for _, sc := range scs {
   227  		dimensions := []*spb.StringPair{
   228  			{
   229  				Key:   "pool",
   230  				Value: sc.Pool,
   231  			},
   232  			{
   233  				Key:   "os",
   234  				Value: "Mac",
   235  			},
   236  		}
   237  		bb, err := sc.client.ListBots(ctx, dimensions)
   238  		if err != nil {
   239  			return nil, fmt.Errorf("error listing bots: %w", err)
   240  		}
   241  
   242  		for _, b := range bb {
   243  			id := b.GetBotId()
   244  			match := botIDRe.FindStringSubmatch(id)
   245  			if match == nil {
   246  				log.Printf("Swarming bot %s is not a MacService bot, skipping...", id)
   247  				continue
   248  			}
   249  
   250  			lease := match[1]
   251  			m[lease] = b
   252  		}
   253  	}
   254  
   255  	return m, nil
   256  }
   257  
   258  // macServiceLeases returns the set of active MacService leases.
   259  func macServiceLeases(mc macServiceClient) (map[string]macservice.Instance, error) {
   260  	resp, err := mc.Find(macservice.FindRequest{
   261  		VMResourceNamespace: macservice.Namespace{
   262  			CustomerName: "golang",
   263  		},
   264  	})
   265  	if err != nil {
   266  		return nil, fmt.Errorf("error finding leases: %v", err)
   267  	}
   268  
   269  	m := make(map[string]macservice.Instance)
   270  
   271  	for _, i := range resp.Instances {
   272  		m[i.Lease.LeaseID] = i
   273  	}
   274  
   275  	return m, nil
   276  }
   277  
   278  // handleMissingBots detects MacService leases that MacService thinks are
   279  // running, but never connected to LUCI (i.e., missing completely from LUCI)
   280  // and destroys them.
   281  //
   282  // These are bots that perhaps never successfully booted?
   283  func handleMissingBots(mc macServiceClient, bots map[string]*spb.BotInfo, leases map[string]macservice.Instance) {
   284  	log.Printf("Checking for missing bots...")
   285  
   286  	var missing []string
   287  	for id := range leases {
   288  		if _, ok := bots[id]; !ok {
   289  			missing = append(missing, id)
   290  		}
   291  	}
   292  	// Sort to make the logs easier to follow when comparing vs a bot/lease
   293  	// list.
   294  	sort.Strings(missing)
   295  
   296  	for _, id := range missing {
   297  		lease := leases[id]
   298  
   299  		if !leaseIsManaged(lease.Lease) {
   300  			log.Printf("Lease %s missing from LUCI, but not managed by makemac; skipping", id)
   301  			continue
   302  		}
   303  
   304  		// There is a race window here: if this lease was created in
   305  		// the last few minutes, the initial boot may still be ongoing,
   306  		// and thus being missing from LUCI is expected. We don't want
   307  		// to destroy these leases.
   308  		//
   309  		// Unfortunately MacService doesn't report lease creation time,
   310  		// so we can't trivially check for this case. It does report
   311  		// expiration time. As a workaround, we create new leases with
   312  		// a 24h expiration time, but renew leases with a 23h
   313  		// expiration. Thus if we see expiration is >23h from now then
   314  		// this lease must have been created in the last hour.
   315  		untilExpiration := time.Until(lease.Lease.Expires)
   316  		if untilExpiration > renewExpirationDuration {
   317  			log.Printf("Lease %s missing from LUCI, but created in the last hour (still booting?); skipping", id)
   318  			continue
   319  		}
   320  
   321  		log.Printf("Lease %s missing from LUCI; failed initial boot?", id)
   322  		log.Printf("Vacating lease %s...", id)
   323  		if err := mc.Vacate(macservice.VacateRequest{LeaseID: id}); err != nil {
   324  			log.Printf("Error vacating lease %s: %v", id, err)
   325  			continue
   326  		}
   327  		delete(leases, id) // Drop from map so future calls know it is gone.
   328  	}
   329  }
   330  
   331  // handleDeadBots detects MacService leases that MacService thinks are running,
   332  // but LUCI thinks are dead (froze/crashed?) and destoys them.
   333  //
   334  // These are bots that perhaps froze/crashed at some point after starting.
   335  func handleDeadBots(mc macServiceClient, bots map[string]*spb.BotInfo, leases map[string]macservice.Instance) {
   336  	log.Printf("Checking for dead bots...")
   337  
   338  	var dead []string
   339  	for id, b := range bots {
   340  		if b.GetIsDead() {
   341  			dead = append(dead, id)
   342  		}
   343  	}
   344  	// Sort to make the logs easier to follow when comparing vs a bot/lease
   345  	// list.
   346  	sort.Strings(dead)
   347  
   348  	for _, id := range dead {
   349  		lease, ok := leases[id]
   350  		if !ok {
   351  			// Dead bot already gone from MacService; nothing to do.
   352  			continue
   353  		}
   354  
   355  		if !leaseIsManaged(lease.Lease) {
   356  			log.Printf("Lease %s is dead on LUCI, but still present on MacService, but not managed by makemac; skipping", id)
   357  			continue
   358  		}
   359  
   360  		// No need to check for newly created leases like we do in
   361  		// handleMissingBots. If a bot appears as dead on LUCI then it
   362  		// must have successfully connected at some point.
   363  
   364  		log.Printf("Lease %s is dead on LUCI, but still present on MacService; VM froze/crashed?", id)
   365  		log.Printf("Vacating lease %s...", id)
   366  		if err := mc.Vacate(macservice.VacateRequest{LeaseID: id}); err != nil {
   367  			log.Printf("Error vacating lease %s: %v", id, err)
   368  			continue
   369  		}
   370  		delete(leases, id) // Drop from map so future calls know it is gone.
   371  	}
   372  }
   373  
   374  // renewLeases renews lease expiration on all makemac-managed leases. Note that
   375  // this may renew leases that will later be removed because their image is no
   376  // longer required. This is harmless.
   377  func renewLeases(mc macServiceClient, leases map[string]macservice.Instance) {
   378  	log.Printf("Renewing leases...")
   379  
   380  	var ids []string
   381  	for id := range leases {
   382  		ids = append(ids, id)
   383  	}
   384  	// Sort to make the logs easier to follow when comparing vs a bot/lease
   385  	// list.
   386  	sort.Strings(ids)
   387  
   388  	for _, id := range ids {
   389  		lease := leases[id]
   390  
   391  		if !leaseIsManaged(lease.Lease) {
   392  			log.Printf("Lease %s is not managed by makemac; skipping renew", id)
   393  			continue
   394  		}
   395  
   396  		// Extra spaces to make expiration line up with the renewal message below.
   397  		log.Printf("Lease ID: %s currently expires:    %v", lease.Lease.LeaseID, lease.Lease.Expires)
   398  
   399  		// Newly created leases have a longer expiration duration than
   400  		// our renewal expiration duration. Don't renew these, which
   401  		// would would unintentionally shorten their expiration. See
   402  		// comment in handleMissingBots.
   403  		until := time.Until(lease.Lease.Expires)
   404  		if until > renewExpirationDuration {
   405  			log.Printf("Lease ID: %s skip renew, current expiration further out than renew expiration", lease.Lease.LeaseID)
   406  			continue
   407  		}
   408  
   409  		rr, err := mc.Renew(macservice.RenewRequest{
   410  			LeaseID:  lease.Lease.LeaseID,
   411  			Duration: renewExpirationDurationString,
   412  		})
   413  		if err == nil {
   414  			log.Printf("Lease ID: %s renewed, now expires: %v", lease.Lease.LeaseID, rr.Expires)
   415  		} else {
   416  			log.Printf("Lease ID: %s error renewing %v", lease.Lease.LeaseID, err)
   417  		}
   418  	}
   419  }
   420  
   421  // handleObsoleteLeases vacates any makemac-managed leases with images that are
   422  // not requested by imageConfigs. This typically occurs when updating makemac
   423  // to roll out a new image version.
   424  func handleObsoleteLeases(mc macServiceClient, config map[*swarmingConfig][]imageConfig, leases map[string]macservice.Instance) {
   425  	log.Printf("Checking for leases with obsolete images...")
   426  
   427  	// swarming host -> image sha -> image config
   428  	swarmingImages := make(map[string]map[string]*imageConfig)
   429  	for sc, ic := range config {
   430  		swarmingImages[sc.Host] = imageConfigMap(ic)
   431  	}
   432  
   433  	var ids []string
   434  	for id := range leases {
   435  		ids = append(ids, id)
   436  	}
   437  	// Sort to make the logs easier to follow when comparing vs a bot/lease
   438  	// list.
   439  	sort.Strings(ids)
   440  
   441  	for _, id := range ids {
   442  		lease := leases[id]
   443  
   444  		swarming := leaseSwarmingHost(lease.Lease)
   445  		if swarming == "" {
   446  			log.Printf("Lease %s is not managed by makemac; skipping image check", id)
   447  			continue
   448  		}
   449  
   450  		images, ok := swarmingImages[swarming]
   451  		if !ok {
   452  			log.Printf("Lease %s belongs to unknown swarming host %s; skipping image check", id, swarming)
   453  			continue
   454  		}
   455  
   456  		image := lease.InstanceSpecification.DiskSelection.ImageHashes.BootSHA256
   457  		if _, ok := images[image]; ok {
   458  			continue
   459  		}
   460  
   461  		// Config doesn't want instances with this image. Vacate.
   462  		log.Printf("Lease %s uses obsolete image %s", id, image)
   463  		log.Printf("Vacating lease %s...", id)
   464  		if err := mc.Vacate(macservice.VacateRequest{LeaseID: id}); err != nil {
   465  			log.Printf("Error vacating lease %s: %v", id, err)
   466  			continue
   467  		}
   468  		delete(leases, id) // Drop from map so future calls know it is gone.
   469  	}
   470  }
   471  
   472  func makeLeaseRequest(sc *swarmingConfig, ic *imageConfig) (macservice.LeaseRequest, error) {
   473  	cert, err := secret.DefaultResolver.ResolveSecret(ic.Cert)
   474  	if err != nil {
   475  		return macservice.LeaseRequest{}, fmt.Errorf("error resolving certificate secret: %w", err)
   476  	}
   477  	key, err := secret.DefaultResolver.ResolveSecret(ic.Key)
   478  	if err != nil {
   479  		return macservice.LeaseRequest{}, fmt.Errorf("error resolving key secret: %w", err)
   480  	}
   481  
   482  	return macservice.LeaseRequest{
   483  		VMResourceNamespace: macservice.Namespace{
   484  			CustomerName: macServiceCustomer,
   485  			ProjectName:  managedProjectPrefix+"/"+sc.Host,
   486  		},
   487  		InstanceSpecification: macservice.InstanceSpecification{
   488  			Profile: macservice.V1_MEDIUM_VM,
   489  			AccessLevel: macservice.GOLANG_OSS,
   490  			DiskSelection: macservice.DiskSelection{
   491  				ImageHashes: macservice.ImageHashes{
   492  					BootSHA256: ic.Image,
   493  				},
   494  			},
   495  			Metadata: []macservice.MetadataEntry{
   496  				{
   497  					Key:   "golang.swarming",
   498  					Value: sc.Host,
   499  				},
   500  				{
   501  					Key:   "golang.hostname",
   502  					Value: ic.Hostname,
   503  				},
   504  				{
   505  					Key:   "golang.cert",
   506  					Value: cert,
   507  				},
   508  				{
   509  					Key:   "golang.key",
   510  					Value: key,
   511  				},
   512  			},
   513  		},
   514  		Duration: createExpirationDurationString,
   515  	}, nil
   516  }
   517  
   518  // addNewLeases adds new MacService leases as needed to ensure that there are
   519  // at least MinCount makemac-managed leases of each configured image type.
   520  func addNewLeases(mc macServiceClient, config map[*swarmingConfig][]imageConfig, leases map[string]macservice.Instance) {
   521  	log.Printf("Checking if new leases are required...")
   522  
   523  	// Count images per swarming host. Each host gets a different
   524  	// configuration. Map of swarming host -> image sha -> count.
   525  	swarmingImageCount := make(map[string]map[string]int)
   526  	for _, lease := range leases {
   527  		swarming := leaseSwarmingHost(lease.Lease)
   528  		if swarming == "" {
   529  			// Don't count leases we don't manage.
   530  			continue
   531  		}
   532  		if _, ok := swarmingImageCount[swarming]; !ok {
   533  			swarmingImageCount[swarming] = make(map[string]int)
   534  		}
   535  
   536  		image := lease.InstanceSpecification.DiskSelection.ImageHashes.BootSHA256
   537  		swarmingImageCount[swarming][image]++
   538  	}
   539  
   540  	// Iterate through configs in swarming order, then image order.
   541  	swarmingOrder := sortedSwarmingConfigs(config)
   542  	imageMap := make([]map[string]*imageConfig, 0, len(swarmingOrder))
   543  	imageOrder := make([][]string, 0, len(swarmingOrder))
   544  	for _, sc := range swarmingOrder {
   545  		m := imageConfigMap(config[sc])
   546  		order := make([]string, 0, len(m))
   547  		for image := range m {
   548  			order = append(order, image)
   549  		}
   550  		sort.Strings(order)
   551  		imageMap = append(imageMap, m)
   552  		imageOrder = append(imageOrder, order)
   553  	}
   554  
   555  	log.Printf("Current image lease count:")
   556  	for i, sc := range swarmingOrder {
   557  		for _, image := range imageOrder[i] {
   558  			config := imageMap[i][image]
   559  			gotCount := swarmingImageCount[sc.Host][config.Image]
   560  			log.Printf("\tHost %s: image %s: have %d leases\twant %d leases", sc.Host, config.Image, gotCount, config.MinCount)
   561  		}
   562  	}
   563  
   564  	for i, sc := range swarmingOrder {
   565  		for _, image := range imageOrder[i] {
   566  			config := imageMap[i][image]
   567  			gotCount := swarmingImageCount[sc.Host][config.Image]
   568  			need := config.MinCount - gotCount
   569  			if need <= 0 {
   570  				continue
   571  			}
   572  
   573  			log.Printf("Host %s: image %s: creating %d new leases", sc.Host, config.Image, need)
   574  			req, err := makeLeaseRequest(sc, config)
   575  			if err != nil {
   576  				log.Printf("Host %s: image %s: creating lease request: error %v", sc.Host, config.Image, err)
   577  				continue
   578  			}
   579  
   580  			for i := 0; i < need; i++ {
   581  				log.Printf("Host %s: image %s: creating lease %d...", sc.Host, config.Image, i)
   582  				resp, err := mc.Lease(req)
   583  				if err != nil {
   584  					log.Printf("Host %s: image %s: creating lease %d: error %v", sc.Host, config.Image, i, err)
   585  					continue
   586  				}
   587  				log.Printf("Host %s: image %s: created lease %s", sc.Host, config.Image, resp.PendingLease.LeaseID)
   588  			}
   589  		}
   590  	}
   591  }