golang.org/x/build@v0.0.0-20240506185731-218518f32b70/internal/coordinator/pool/ec2.go (about)

     1  // Copyright 2020 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build linux || darwin
     6  
     7  package pool
     8  
     9  import (
    10  	"context"
    11  	"errors"
    12  	"fmt"
    13  	"html"
    14  	"io"
    15  	"log"
    16  	"sync"
    17  	"time"
    18  
    19  	"golang.org/x/build/buildenv"
    20  	"golang.org/x/build/buildlet"
    21  	"golang.org/x/build/dashboard"
    22  	"golang.org/x/build/internal"
    23  	"golang.org/x/build/internal/cloud"
    24  	"golang.org/x/build/internal/coordinator/pool/queue"
    25  	"golang.org/x/build/internal/spanlog"
    26  )
    27  
// Compile-time assertion that *EC2Buildlet satisfies the Buildlet interface.
var _ Buildlet = (*EC2Buildlet)(nil)

// ec2Buildlet is the package level buildlet pool. It is initialized with a
// minimal pool in init and replaced by every successful call to NewEC2Buildlet.
//
// TODO(golang.org/issues/38337) remove once a package level variable is no longer
// required by the main package.
var ec2Buildlet *EC2Buildlet
    35  
    36  // EC2BuildetPool retrieves the package level EC2Buildlet pool set by the constructor.
    37  //
    38  // TODO(golang.org/issues/38337) remove once a package level variable is no longer
    39  // required by the main package.
    40  func EC2BuildetPool() *EC2Buildlet {
    41  	return ec2Buildlet
    42  }
    43  
    44  func init() {
    45  	// initializes a basic package level ec2Buildlet pool to enable basic testing in other
    46  	// packages.
    47  	//
    48  	// TODO(golang.org/issues/38337) remove once a package level variable is no longer
    49  	// required by the main package.
    50  	ec2Buildlet = &EC2Buildlet{
    51  		ledger: newLedger(),
    52  	}
    53  }
    54  
// awsClient represents the AWS client used by the pool to interact with AWS. It
// lists only the methods the pool calls; NewEC2Buildlet stores a *cloud.AWSClient
// in a field of this type.
type awsClient interface {
	// DestroyInstances requests the destruction of the instances with the given IDs.
	DestroyInstances(ctx context.Context, instIDs ...string) error
	// Quota returns the quota value for the given service and quota code. The pool
	// uses it to obtain the on-demand CPU limit.
	Quota(ctx context.Context, service, code string) (int64, error)
	// InstanceTypesARM returns the ARM instance types; the pool stores them in the ledger.
	InstanceTypesARM(ctx context.Context) ([]*cloud.InstanceType, error)
	// RunningInstances returns the running instances; the pool scans them for
	// untracked buildlets.
	RunningInstances(ctx context.Context) ([]*cloud.Instance, error)
}
    63  
// EC2Opt is optional configuration for the buildlet pool. NewEC2Buildlet applies
// each option to the pool after its fields have been populated and before the
// initial quota and instance types are retrieved.
type EC2Opt func(*EC2Buildlet)
    66  
// EC2Buildlet manages a pool of AWS EC2 buildlets.
type EC2Buildlet struct {
	// awsClient is the client used to interact with AWS services.
	awsClient awsClient
	// buildEnv contains the build environment settings.
	buildEnv *buildenv.Environment
	// buildletClient is the client used to create a buildlet.
	buildletClient ec2BuildletClient
	// hosts provides the host configuration for all hosts, keyed by host type.
	// It is passed in to facilitate testing.
	hosts map[string]*dashboard.HostConfig
	// isRemoteBuildlet reports whether a VM instance is being used as a remote
	// buildlet.
	//
	// TODO(golang.org/issues/38337) remove once we find a way to pass in remote buildlet
	// information at the get buildlet request.
	isRemoteBuildlet IsRemoteBuildletFunc
	// ledger tracks instances and their resource allocations.
	ledger *ledger
	// cancelPoll will signal to the pollers to discontinue polling. It is set by
	// NewEC2Buildlet; the pool created in init leaves it nil.
	cancelPoll context.CancelFunc
	// pollWait waits for all pollers to terminate polling.
	pollWait sync.WaitGroup
}
    91  
// ec2BuildletClient represents an EC2 buildlet client in the buildlet package.
// NewEC2Buildlet populates it with buildlet.NewEC2Client.
type ec2BuildletClient interface {
	// StartNewVM starts a new VM named vmName for the given host type and returns
	// a buildlet client for it. The pool passes lifecycle callbacks through opts.
	StartNewVM(ctx context.Context, buildEnv *buildenv.Environment, hconf *dashboard.HostConfig, vmName, hostType string, opts *buildlet.VMOpts) (buildlet.Client, error)
}
    96  
    97  // NewEC2Buildlet creates a new EC2 buildlet pool used to create and manage the lifecycle of
    98  // EC2 buildlets. Information about ARM64 instance types is retrieved before starting the pool.
    99  // EC2 quota types are also retrieved before starting the pool. The pool will continuously poll
   100  // for quotas which limit the resources that can be consumed by the pool. It will also periodically
   101  // search for VMs which are no longer in use or are untracked by the pool in order to delete them.
   102  func NewEC2Buildlet(client *cloud.AWSClient, buildEnv *buildenv.Environment, hosts map[string]*dashboard.HostConfig, fn IsRemoteBuildletFunc, opts ...EC2Opt) (*EC2Buildlet, error) {
   103  	if fn == nil {
   104  		return nil, errors.New("remote buildlet check function is not set")
   105  	}
   106  	ctx, cancel := context.WithCancel(context.Background())
   107  	b := &EC2Buildlet{
   108  		awsClient:        client,
   109  		buildEnv:         buildEnv,
   110  		buildletClient:   buildlet.NewEC2Client(client),
   111  		cancelPoll:       cancel,
   112  		hosts:            hosts,
   113  		isRemoteBuildlet: fn,
   114  		ledger:           newLedger(),
   115  	}
   116  	for _, opt := range opts {
   117  		opt(b)
   118  	}
   119  	if err := b.retrieveAndSetQuota(ctx); err != nil {
   120  		return nil, fmt.Errorf("unable to create EC2 pool: %w", err)
   121  	}
   122  	if err := b.retrieveAndSetInstanceTypes(); err != nil {
   123  		return nil, fmt.Errorf("unable to create EC2 pool: %w", err)
   124  	}
   125  
   126  	b.pollWait.Add(1)
   127  	// polls for the EC2 quota data and sets the quota data in
   128  	// the ledger. When the context has been cancelled, the polling will stop.
   129  	go func() {
   130  		go internal.PeriodicallyDo(ctx, time.Hour, func(ctx context.Context, _ time.Time) {
   131  			log.Printf("retrieveing EC2 quota")
   132  			_ = b.retrieveAndSetQuota(ctx)
   133  		})
   134  		b.pollWait.Done()
   135  	}()
   136  
   137  	b.pollWait.Add(1)
   138  	// poll queries for VMs which are not tracked in the ledger and
   139  	// deletes them. When the context has been cancelled, the polling will stop.
   140  	go func() {
   141  		go internal.PeriodicallyDo(ctx, 2*time.Minute, func(ctx context.Context, _ time.Time) {
   142  			log.Printf("cleaning up unused EC2 instances")
   143  			b.destroyUntrackedInstances(ctx)
   144  		})
   145  		b.pollWait.Done()
   146  	}()
   147  
   148  	// TODO(golang.org/issues/38337) remove once a package level variable is no longer
   149  	// required by the main package.
   150  	ec2Buildlet = b
   151  	return b, nil
   152  }
   153  
   154  // GetBuildlet retrieves a buildlet client for a newly created buildlet.
   155  func (eb *EC2Buildlet) GetBuildlet(ctx context.Context, hostType string, lg Logger, si *queue.SchedItem) (buildlet.Client, error) {
   156  	hconf, ok := eb.hosts[hostType]
   157  	if !ok {
   158  		return nil, fmt.Errorf("ec2 pool: unknown host type %q", hostType)
   159  	}
   160  	instName := instanceName(hostType, 7)
   161  	log.Printf("Creating EC2 VM %q for %s", instName, hostType)
   162  	kp, err := buildlet.NewKeyPair()
   163  	if err != nil {
   164  		log.Printf("failed to create TLS key pair for %s: %s", hostType, err)
   165  		return nil, fmt.Errorf("failed to create TLS key pair: %w", err)
   166  	}
   167  
   168  	qsp := lg.CreateSpan("awaiting_ec2_quota")
   169  	err = eb.ledger.ReserveResources(ctx, instName, hconf.MachineType(), si)
   170  	qsp.Done(err)
   171  	if err != nil {
   172  		return nil, err
   173  	}
   174  
   175  	ec2BuildletSpan := lg.CreateSpan("create_ec2_buildlet", instName)
   176  	defer func() { ec2BuildletSpan.Done(err) }()
   177  
   178  	var (
   179  		createSpan      = lg.CreateSpan("create_ec2_instance", instName)
   180  		waitBuildlet    spanlog.Span
   181  		curSpan         = createSpan
   182  		instanceCreated bool
   183  	)
   184  	bc, err := eb.buildletClient.StartNewVM(ctx, eb.buildEnv, hconf, instName, hostType, &buildlet.VMOpts{
   185  		Zone:     "", // allow the EC2 api pick an availability zone with capacity
   186  		TLS:      kp,
   187  		Meta:     make(map[string]string),
   188  		DeleteIn: determineDeleteTimeout(hconf),
   189  		OnInstanceRequested: func() {
   190  			log.Printf("EC2 VM %q now booting", instName)
   191  		},
   192  		OnInstanceCreated: func() {
   193  			log.Printf("EC2 VM %q now running", instName)
   194  			createSpan.Done(nil)
   195  			instanceCreated = true
   196  			waitBuildlet = lg.CreateSpan("wait_buildlet_start", instName)
   197  			curSpan = waitBuildlet
   198  		},
   199  		OnGotEC2InstanceInfo: func(inst *cloud.Instance) {
   200  			lg.LogEventTime("got_instance_info", "waiting_for_buildlet...")
   201  			eb.ledger.UpdateReservation(instName, inst.ID)
   202  		},
   203  	})
   204  	if err != nil {
   205  		curSpan.Done(err)
   206  		log.Printf("EC2 VM creation failed for %s: %v", hostType, err)
   207  		if instanceCreated {
   208  			log.Printf("EC2 VM %q failed initialize buildlet client. deleting...", instName)
   209  			eb.buildletDone(instName)
   210  		} else {
   211  			eb.ledger.Remove(instName)
   212  		}
   213  		return nil, err
   214  	}
   215  	waitBuildlet.Done(nil)
   216  	bc.SetDescription(fmt.Sprintf("EC2 VM: %s", instName))
   217  	bc.SetOnHeartbeatFailure(func() {
   218  		log.Printf("EC2 VM %q failed heartbeat", instName)
   219  		eb.buildletDone(instName)
   220  	})
   221  	bc.SetInstanceName(instName)
   222  	return bc, nil
   223  }
   224  
   225  func (eb *EC2Buildlet) QuotaStats() map[string]*queue.QuotaStats {
   226  	return map[string]*queue.QuotaStats{
   227  		"ec2-cpu": eb.ledger.cpuQueue.ToExported(),
   228  	}
   229  }
   230  
   231  // String gives a report of capacity usage for the EC2 buildlet pool.
   232  func (eb *EC2Buildlet) String() string {
   233  	return fmt.Sprintf("EC2 pool capacity: %s", eb.capacityString())
   234  }
   235  
   236  // capacityString() gives a report of capacity usage.
   237  func (eb *EC2Buildlet) capacityString() string {
   238  	r := eb.ledger.Resources()
   239  	return fmt.Sprintf("%d instances; %d/%d CPUs", r.InstCount, r.CPUUsed, r.CPULimit)
   240  }
   241  
   242  // WriteHTMLStatus writes the status of the EC2 buildlet pool to an io.Writer.
   243  func (eb *EC2Buildlet) WriteHTMLStatus(w io.Writer) {
   244  	fmt.Fprintf(w, "<b>EC2 pool</b> capacity: %s", eb.capacityString())
   245  
   246  	active := eb.ledger.ResourceTime()
   247  	if len(active) > 0 {
   248  		fmt.Fprintf(w, "<ul>")
   249  		for _, inst := range active {
   250  			fmt.Fprintf(w, "<li>%v, %s</li>\n", html.EscapeString(inst.Name), friendlyDuration(time.Since(inst.Creation)))
   251  		}
   252  		fmt.Fprintf(w, "</ul>")
   253  	}
   254  }
   255  
   256  // buildletDone issues a call to destroy the EC2 instance and removes
   257  // the instance from the ledger. Removing the instance from the ledger
   258  // also releases any resources allocated to that instance. If an instance
   259  // is not found in the ledger or on EC2 then an error is logged. All
   260  // untracked instances will be cleaned up by the polling cleanupUnusedVMs
   261  // method.
   262  func (eb *EC2Buildlet) buildletDone(instName string) {
   263  	vmID := eb.ledger.InstanceID(instName)
   264  	if vmID == "" {
   265  		log.Printf("EC2 vm %s not found", instName)
   266  		return
   267  	}
   268  	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
   269  	defer cancel()
   270  	if err := eb.awsClient.DestroyInstances(ctx, vmID); err != nil {
   271  		log.Printf("EC2 VM %s deletion failed: %s", instName, err)
   272  	}
   273  	eb.ledger.Remove(instName)
   274  }
   275  
   276  // Close stops the pollers used by the EC2Buildlet pool from running.
   277  func (eb *EC2Buildlet) Close() {
   278  	eb.cancelPoll()
   279  	eb.pollWait.Wait()
   280  }
   281  
   282  // retrieveAndSetQuota queries EC2 for account relevant quotas and sets the quota in the ledger.
   283  func (eb *EC2Buildlet) retrieveAndSetQuota(ctx context.Context) error {
   284  	ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
   285  	defer cancel()
   286  
   287  	cpuQuota, err := eb.awsClient.Quota(ctx, cloud.QuotaServiceEC2, cloud.QuotaCodeCPUOnDemand)
   288  	if err != nil {
   289  		log.Printf("unable to query for EC2 cpu quota: %s", err)
   290  		return err
   291  	}
   292  	eb.ledger.SetCPULimit(cpuQuota)
   293  	return nil
   294  }
   295  
   296  // retrieveAndSetInstanceTypes retrieves the ARM64 instance types from the EC2
   297  // service and sets them in the ledger.
   298  func (eb *EC2Buildlet) retrieveAndSetInstanceTypes() error {
   299  	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
   300  	defer cancel()
   301  
   302  	its, err := eb.awsClient.InstanceTypesARM(ctx)
   303  	if err != nil {
   304  		return fmt.Errorf("unable to retrieve EC2 instance types: %w", err)
   305  	}
   306  	eb.ledger.UpdateInstanceTypes(its)
   307  	log.Printf("ec2 buildlet pool instance types updated")
   308  	return nil
   309  }
   310  
   311  // destroyUntrackedInstances searches for VMs which exist but are not being tracked in the
   312  // ledger and deletes them.
   313  func (eb *EC2Buildlet) destroyUntrackedInstances(ctx context.Context) {
   314  	ctx, cancel := context.WithTimeout(ctx, 30*time.Second)
   315  	defer cancel()
   316  
   317  	insts, err := eb.awsClient.RunningInstances(ctx)
   318  	if err != nil {
   319  		log.Printf("failed to query for instances: %s", err)
   320  		return
   321  	}
   322  	deleteInsts := make([]string, 0, len(insts))
   323  	for _, inst := range insts {
   324  		if !isBuildlet(inst.Name) {
   325  			// Non-buildlets have not been created by the EC2 buildlet pool. Their lifecycle
   326  			// should not be managed by the pool.
   327  			log.Printf("destroyUntrackedInstances: skipping non-buildlet %q", inst.Name)
   328  			continue
   329  		}
   330  		if eb.isRemoteBuildlet(inst.Name) {
   331  			// Remote buildlets have their own expiration mechanism that respects active SSH sessions.
   332  			log.Printf("destroyUntrackedInstances: skipping remote buildlet %q", inst.Name)
   333  			continue
   334  		}
   335  		if id := eb.ledger.InstanceID(inst.Name); id != "" {
   336  			continue
   337  		}
   338  		deleteInsts = append(deleteInsts, inst.ID)
   339  		log.Printf("queued for deleting untracked EC2 VM %q with id %q", inst.Name, inst.ID)
   340  	}
   341  	if len(deleteInsts) == 0 {
   342  		return
   343  	}
   344  	if err := eb.awsClient.DestroyInstances(ctx, deleteInsts...); err != nil {
   345  		log.Printf("failed cleaning EC2 VMs: %s", err)
   346  	}
   347  }