golang.org/x/build@v0.0.0-20240506185731-218518f32b70/internal/coordinator/pool/ec2.go (about) 1 // Copyright 2020 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 //go:build linux || darwin 6 7 package pool 8 9 import ( 10 "context" 11 "errors" 12 "fmt" 13 "html" 14 "io" 15 "log" 16 "sync" 17 "time" 18 19 "golang.org/x/build/buildenv" 20 "golang.org/x/build/buildlet" 21 "golang.org/x/build/dashboard" 22 "golang.org/x/build/internal" 23 "golang.org/x/build/internal/cloud" 24 "golang.org/x/build/internal/coordinator/pool/queue" 25 "golang.org/x/build/internal/spanlog" 26 ) 27 28 var _ Buildlet = (*EC2Buildlet)(nil) 29 30 // ec2Buildlet is the package level buildlet pool. 31 // 32 // TODO(golang.org/issues/38337) remove once a package level variable is no longer 33 // required by the main package. 34 var ec2Buildlet *EC2Buildlet 35 36 // EC2BuildetPool retrieves the package level EC2Buildlet pool set by the constructor. 37 // 38 // TODO(golang.org/issues/38337) remove once a package level variable is no longer 39 // required by the main package. 40 func EC2BuildetPool() *EC2Buildlet { 41 return ec2Buildlet 42 } 43 44 func init() { 45 // initializes a basic package level ec2Buildlet pool to enable basic testing in other 46 // packages. 47 // 48 // TODO(golang.org/issues/38337) remove once a package level variable is no longer 49 // required by the main package. 50 ec2Buildlet = &EC2Buildlet{ 51 ledger: newLedger(), 52 } 53 } 54 55 // awsClient represents the aws client used to interact with AWS. This is a partial 56 // implementation of pool.AWSClient. 57 type awsClient interface { 58 DestroyInstances(ctx context.Context, instIDs ...string) error 59 Quota(ctx context.Context, service, code string) (int64, error) 60 InstanceTypesARM(ctx context.Context) ([]*cloud.InstanceType, error) 61 RunningInstances(ctx context.Context) ([]*cloud.Instance, error) 62 } 63 64 // EC2Opt is optional configuration for the buildlet. 65 type EC2Opt func(*EC2Buildlet) 66 67 // EC2Buildlet manages a pool of AWS EC2 buildlets. 68 type EC2Buildlet struct { 69 // awsClient is the client used to interact with AWS services. 70 awsClient awsClient 71 // buildEnv contains the build environment settings. 72 buildEnv *buildenv.Environment 73 // buildletClient is the client used to create a buildlet. 74 buildletClient ec2BuildletClient 75 // hosts provides the host configuration for all hosts. It is passed in to facilitate 76 // testing. 77 hosts map[string]*dashboard.HostConfig 78 // isRemoteBuildletFunc informs the caller is a VM instance is being used as a remote 79 // buildlet. 80 // 81 // TODO(golang.org/issues/38337) remove once we find a way to pass in remote buildlet 82 // information at the get buidlet request. 83 isRemoteBuildlet IsRemoteBuildletFunc 84 // ledger tracks instances and their resource allocations. 85 ledger *ledger 86 // cancelPoll will signal to the pollers to discontinue polling. 87 cancelPoll context.CancelFunc 88 // pollWait waits for all pollers to terminate polling. 89 pollWait sync.WaitGroup 90 } 91 92 // ec2BuildletClient represents an EC2 buildlet client in the buildlet package. 93 type ec2BuildletClient interface { 94 StartNewVM(ctx context.Context, buildEnv *buildenv.Environment, hconf *dashboard.HostConfig, vmName, hostType string, opts *buildlet.VMOpts) (buildlet.Client, error) 95 } 96 97 // NewEC2Buildlet creates a new EC2 buildlet pool used to create and manage the lifecycle of 98 // EC2 buildlets. Information about ARM64 instance types is retrieved before starting the pool. 99 // EC2 quota types are also retrieved before starting the pool. The pool will continuously poll 100 // for quotas which limit the resources that can be consumed by the pool. It will also periodically 101 // search for VMs which are no longer in use or are untracked by the pool in order to delete them. 102 func NewEC2Buildlet(client *cloud.AWSClient, buildEnv *buildenv.Environment, hosts map[string]*dashboard.HostConfig, fn IsRemoteBuildletFunc, opts ...EC2Opt) (*EC2Buildlet, error) { 103 if fn == nil { 104 return nil, errors.New("remote buildlet check function is not set") 105 } 106 ctx, cancel := context.WithCancel(context.Background()) 107 b := &EC2Buildlet{ 108 awsClient: client, 109 buildEnv: buildEnv, 110 buildletClient: buildlet.NewEC2Client(client), 111 cancelPoll: cancel, 112 hosts: hosts, 113 isRemoteBuildlet: fn, 114 ledger: newLedger(), 115 } 116 for _, opt := range opts { 117 opt(b) 118 } 119 if err := b.retrieveAndSetQuota(ctx); err != nil { 120 return nil, fmt.Errorf("unable to create EC2 pool: %w", err) 121 } 122 if err := b.retrieveAndSetInstanceTypes(); err != nil { 123 return nil, fmt.Errorf("unable to create EC2 pool: %w", err) 124 } 125 126 b.pollWait.Add(1) 127 // polls for the EC2 quota data and sets the quota data in 128 // the ledger. When the context has been cancelled, the polling will stop. 129 go func() { 130 go internal.PeriodicallyDo(ctx, time.Hour, func(ctx context.Context, _ time.Time) { 131 log.Printf("retrieveing EC2 quota") 132 _ = b.retrieveAndSetQuota(ctx) 133 }) 134 b.pollWait.Done() 135 }() 136 137 b.pollWait.Add(1) 138 // poll queries for VMs which are not tracked in the ledger and 139 // deletes them. When the context has been cancelled, the polling will stop. 140 go func() { 141 go internal.PeriodicallyDo(ctx, 2*time.Minute, func(ctx context.Context, _ time.Time) { 142 log.Printf("cleaning up unused EC2 instances") 143 b.destroyUntrackedInstances(ctx) 144 }) 145 b.pollWait.Done() 146 }() 147 148 // TODO(golang.org/issues/38337) remove once a package level variable is no longer 149 // required by the main package. 150 ec2Buildlet = b 151 return b, nil 152 } 153 154 // GetBuildlet retrieves a buildlet client for a newly created buildlet. 155 func (eb *EC2Buildlet) GetBuildlet(ctx context.Context, hostType string, lg Logger, si *queue.SchedItem) (buildlet.Client, error) { 156 hconf, ok := eb.hosts[hostType] 157 if !ok { 158 return nil, fmt.Errorf("ec2 pool: unknown host type %q", hostType) 159 } 160 instName := instanceName(hostType, 7) 161 log.Printf("Creating EC2 VM %q for %s", instName, hostType) 162 kp, err := buildlet.NewKeyPair() 163 if err != nil { 164 log.Printf("failed to create TLS key pair for %s: %s", hostType, err) 165 return nil, fmt.Errorf("failed to create TLS key pair: %w", err) 166 } 167 168 qsp := lg.CreateSpan("awaiting_ec2_quota") 169 err = eb.ledger.ReserveResources(ctx, instName, hconf.MachineType(), si) 170 qsp.Done(err) 171 if err != nil { 172 return nil, err 173 } 174 175 ec2BuildletSpan := lg.CreateSpan("create_ec2_buildlet", instName) 176 defer func() { ec2BuildletSpan.Done(err) }() 177 178 var ( 179 createSpan = lg.CreateSpan("create_ec2_instance", instName) 180 waitBuildlet spanlog.Span 181 curSpan = createSpan 182 instanceCreated bool 183 ) 184 bc, err := eb.buildletClient.StartNewVM(ctx, eb.buildEnv, hconf, instName, hostType, &buildlet.VMOpts{ 185 Zone: "", // allow the EC2 api pick an availability zone with capacity 186 TLS: kp, 187 Meta: make(map[string]string), 188 DeleteIn: determineDeleteTimeout(hconf), 189 OnInstanceRequested: func() { 190 log.Printf("EC2 VM %q now booting", instName) 191 }, 192 OnInstanceCreated: func() { 193 log.Printf("EC2 VM %q now running", instName) 194 createSpan.Done(nil) 195 instanceCreated = true 196 waitBuildlet = lg.CreateSpan("wait_buildlet_start", instName) 197 curSpan = waitBuildlet 198 }, 199 OnGotEC2InstanceInfo: func(inst *cloud.Instance) { 200 lg.LogEventTime("got_instance_info", "waiting_for_buildlet...") 201 eb.ledger.UpdateReservation(instName, inst.ID) 202 }, 203 }) 204 if err != nil { 205 curSpan.Done(err) 206 log.Printf("EC2 VM creation failed for %s: %v", hostType, err) 207 if instanceCreated { 208 log.Printf("EC2 VM %q failed initialize buildlet client. deleting...", instName) 209 eb.buildletDone(instName) 210 } else { 211 eb.ledger.Remove(instName) 212 } 213 return nil, err 214 } 215 waitBuildlet.Done(nil) 216 bc.SetDescription(fmt.Sprintf("EC2 VM: %s", instName)) 217 bc.SetOnHeartbeatFailure(func() { 218 log.Printf("EC2 VM %q failed heartbeat", instName) 219 eb.buildletDone(instName) 220 }) 221 bc.SetInstanceName(instName) 222 return bc, nil 223 } 224 225 func (eb *EC2Buildlet) QuotaStats() map[string]*queue.QuotaStats { 226 return map[string]*queue.QuotaStats{ 227 "ec2-cpu": eb.ledger.cpuQueue.ToExported(), 228 } 229 } 230 231 // String gives a report of capacity usage for the EC2 buildlet pool. 232 func (eb *EC2Buildlet) String() string { 233 return fmt.Sprintf("EC2 pool capacity: %s", eb.capacityString()) 234 } 235 236 // capacityString() gives a report of capacity usage. 237 func (eb *EC2Buildlet) capacityString() string { 238 r := eb.ledger.Resources() 239 return fmt.Sprintf("%d instances; %d/%d CPUs", r.InstCount, r.CPUUsed, r.CPULimit) 240 } 241 242 // WriteHTMLStatus writes the status of the EC2 buildlet pool to an io.Writer. 243 func (eb *EC2Buildlet) WriteHTMLStatus(w io.Writer) { 244 fmt.Fprintf(w, "<b>EC2 pool</b> capacity: %s", eb.capacityString()) 245 246 active := eb.ledger.ResourceTime() 247 if len(active) > 0 { 248 fmt.Fprintf(w, "<ul>") 249 for _, inst := range active { 250 fmt.Fprintf(w, "<li>%v, %s</li>\n", html.EscapeString(inst.Name), friendlyDuration(time.Since(inst.Creation))) 251 } 252 fmt.Fprintf(w, "</ul>") 253 } 254 } 255 256 // buildletDone issues a call to destroy the EC2 instance and removes 257 // the instance from the ledger. Removing the instance from the ledger 258 // also releases any resources allocated to that instance. If an instance 259 // is not found in the ledger or on EC2 then an error is logged. All 260 // untracked instances will be cleaned up by the polling cleanupUnusedVMs 261 // method. 262 func (eb *EC2Buildlet) buildletDone(instName string) { 263 vmID := eb.ledger.InstanceID(instName) 264 if vmID == "" { 265 log.Printf("EC2 vm %s not found", instName) 266 return 267 } 268 ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) 269 defer cancel() 270 if err := eb.awsClient.DestroyInstances(ctx, vmID); err != nil { 271 log.Printf("EC2 VM %s deletion failed: %s", instName, err) 272 } 273 eb.ledger.Remove(instName) 274 } 275 276 // Close stops the pollers used by the EC2Buildlet pool from running. 277 func (eb *EC2Buildlet) Close() { 278 eb.cancelPoll() 279 eb.pollWait.Wait() 280 } 281 282 // retrieveAndSetQuota queries EC2 for account relevant quotas and sets the quota in the ledger. 283 func (eb *EC2Buildlet) retrieveAndSetQuota(ctx context.Context) error { 284 ctx, cancel := context.WithTimeout(ctx, 10*time.Second) 285 defer cancel() 286 287 cpuQuota, err := eb.awsClient.Quota(ctx, cloud.QuotaServiceEC2, cloud.QuotaCodeCPUOnDemand) 288 if err != nil { 289 log.Printf("unable to query for EC2 cpu quota: %s", err) 290 return err 291 } 292 eb.ledger.SetCPULimit(cpuQuota) 293 return nil 294 } 295 296 // retrieveAndSetInstanceTypes retrieves the ARM64 instance types from the EC2 297 // service and sets them in the ledger. 298 func (eb *EC2Buildlet) retrieveAndSetInstanceTypes() error { 299 ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) 300 defer cancel() 301 302 its, err := eb.awsClient.InstanceTypesARM(ctx) 303 if err != nil { 304 return fmt.Errorf("unable to retrieve EC2 instance types: %w", err) 305 } 306 eb.ledger.UpdateInstanceTypes(its) 307 log.Printf("ec2 buildlet pool instance types updated") 308 return nil 309 } 310 311 // destroyUntrackedInstances searches for VMs which exist but are not being tracked in the 312 // ledger and deletes them. 313 func (eb *EC2Buildlet) destroyUntrackedInstances(ctx context.Context) { 314 ctx, cancel := context.WithTimeout(ctx, 30*time.Second) 315 defer cancel() 316 317 insts, err := eb.awsClient.RunningInstances(ctx) 318 if err != nil { 319 log.Printf("failed to query for instances: %s", err) 320 return 321 } 322 deleteInsts := make([]string, 0, len(insts)) 323 for _, inst := range insts { 324 if !isBuildlet(inst.Name) { 325 // Non-buildlets have not been created by the EC2 buildlet pool. Their lifecycle 326 // should not be managed by the pool. 327 log.Printf("destroyUntrackedInstances: skipping non-buildlet %q", inst.Name) 328 continue 329 } 330 if eb.isRemoteBuildlet(inst.Name) { 331 // Remote buildlets have their own expiration mechanism that respects active SSH sessions. 332 log.Printf("destroyUntrackedInstances: skipping remote buildlet %q", inst.Name) 333 continue 334 } 335 if id := eb.ledger.InstanceID(inst.Name); id != "" { 336 continue 337 } 338 deleteInsts = append(deleteInsts, inst.ID) 339 log.Printf("queued for deleting untracked EC2 VM %q with id %q", inst.Name, inst.ID) 340 } 341 if len(deleteInsts) == 0 { 342 return 343 } 344 if err := eb.awsClient.DestroyInstances(ctx, deleteInsts...); err != nil { 345 log.Printf("failed cleaning EC2 VMs: %s", err) 346 } 347 }