github.com/juju/juju@v0.0.0-20240430160146-1752b71fcf00/worker/provisioner/provisioner_task.go

// Copyright 2012, 2013 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package provisioner

import (
	stdcontext "context"
	"fmt"
	"math/rand"
	"sort"
	"sync"
	"time"

	"github.com/juju/collections/set"
	"github.com/juju/errors"
	"github.com/juju/names/v5"
	"github.com/juju/utils/v3"
	"github.com/juju/version/v2"
	"github.com/juju/worker/v3"
	"github.com/juju/worker/v3/catacomb"

	apiprovisioner "github.com/juju/juju/api/agent/provisioner"
	"github.com/juju/juju/cloudconfig/instancecfg"
	"github.com/juju/juju/container"
	"github.com/juju/juju/controller"
	"github.com/juju/juju/controller/authentication"
	"github.com/juju/juju/core/arch"
	corebase "github.com/juju/juju/core/base"
	"github.com/juju/juju/core/constraints"
	"github.com/juju/juju/core/instance"
	"github.com/juju/juju/core/life"
	"github.com/juju/juju/core/lxdprofile"
	"github.com/juju/juju/core/network"
	"github.com/juju/juju/core/status"
	"github.com/juju/juju/core/watcher"
	"github.com/juju/juju/core/workerpool"
	"github.com/juju/juju/environs"
	"github.com/juju/juju/environs/config"
	"github.com/juju/juju/environs/context"
	"github.com/juju/juju/environs/imagemetadata"
	"github.com/juju/juju/environs/instances"
	"github.com/juju/juju/environs/simplestreams"
	providercommon "github.com/juju/juju/provider/common"
	"github.com/juju/juju/rpc/params"
	"github.com/juju/juju/storage"
	coretools "github.com/juju/juju/tools"
	"github.com/juju/juju/worker/common"
	"github.com/juju/juju/wrench"
)

type ProvisionerTask interface {
	worker.Worker

	// SetHarvestMode sets a flag to indicate how the provisioner task
	// should harvest machines. See config.HarvestMode for
	// documentation of behavior.
	SetHarvestMode(mode config.HarvestMode)

	// SetNumProvisionWorkers resizes the pool of provision workers.
	SetNumProvisionWorkers(numWorkers int)
}

// TaskAPI describes API methods required by a ProvisionerTask.
type TaskAPI interface {
	Machines(...names.MachineTag) ([]apiprovisioner.MachineResult, error)
	MachinesWithTransientErrors() ([]apiprovisioner.MachineStatusResult, error)
	ProvisioningInfo(machineTags []names.MachineTag) (params.ProvisioningInfoResults, error)
}

type DistributionGroupFinder interface {
	DistributionGroupByMachineId(...names.MachineTag) ([]apiprovisioner.DistributionGroupResult, error)
}

// ToolsFinder is an interface used for finding tools to run on
// provisioned instances.
type ToolsFinder interface {
	// FindTools returns a list of tools matching the specified
	// version, os, and architecture. If arch is empty, the
	// implementation is expected to use a well documented default.
	FindTools(version version.Number, os string, arch string) (coretools.List, error)
}

// TaskConfig holds the initialisation data for a ProvisionerTask instance.
type TaskConfig struct {
	ControllerUUID             string
	HostTag                    names.Tag
	Logger                     Logger
	HarvestMode                config.HarvestMode
	TaskAPI                    TaskAPI
	DistributionGroupFinder    DistributionGroupFinder
	ToolsFinder                ToolsFinder
	MachineWatcher             watcher.StringsWatcher
	RetryWatcher               watcher.NotifyWatcher
	Broker                     environs.InstanceBroker
	Auth                       authentication.AuthenticationProvider
	ImageStream                string
	RetryStartInstanceStrategy RetryStrategy
	CloudCallContextFunc       common.CloudCallContextFunc
	NumProvisionWorkers        int
	EventProcessedCb           func(string)
}

func NewProvisionerTask(cfg TaskConfig) (ProvisionerTask, error) {
	machineChanges := cfg.MachineWatcher.Changes()
	workers := []worker.Worker{cfg.MachineWatcher}
	var retryChanges watcher.NotifyChannel
	if cfg.RetryWatcher != nil {
		retryChanges = cfg.RetryWatcher.Changes()
		workers = append(workers, cfg.RetryWatcher)
	}
	task := &provisionerTask{
		controllerUUID:             cfg.ControllerUUID,
		hostTag:                    cfg.HostTag,
		logger:                     cfg.Logger,
		taskAPI:                    cfg.TaskAPI,
		distributionGroupFinder:    cfg.DistributionGroupFinder,
		toolsFinder:                cfg.ToolsFinder,
		machineChanges:             machineChanges,
		retryChanges:               retryChanges,
		broker:                     cfg.Broker,
		auth:                       cfg.Auth,
		harvestMode:                cfg.HarvestMode,
		harvestModeChan:            make(chan config.HarvestMode, 1),
		machines:                   make(map[string]apiprovisioner.MachineProvisioner),
		machinesStarting:           make(map[string]bool),
		machinesStopDeferred:       make(map[string]bool),
		machinesStopping:           make(map[string]bool),
		availabilityZoneMachines:   make([]*AvailabilityZoneMachine, 0),
		imageStream:                cfg.ImageStream,
		retryStartInstanceStrategy: cfg.RetryStartInstanceStrategy,
		cloudCallCtxFunc:           cfg.CloudCallContextFunc,
		wp:                         workerpool.NewWorkerPool(cfg.Logger, cfg.NumProvisionWorkers),
		wpSizeChan:                 make(chan int, 1),
		eventProcessedCb:           cfg.EventProcessedCb,
	}
	err := catacomb.Invoke(catacomb.Plan{
		Site: &task.catacomb,
		Work: task.loop,
		Init: workers,
	})
	if err != nil {
		return nil, errors.Trace(err)
	}
	return task, nil
}

// The list of events that are passed into the eventProcessed callback by the
// main loop.
const (
	eventTypeProcessedMachines         = "processed-machines"
	eventTypeRetriedMachinesWithErrors = "retried-machines-with-errors"
	eventTypeResizedWorkerPool         = "resized-worker-pool"
	eventTypeHarvestModeChanged        = "harvest-mode-changed"
)

type provisionerTask struct {
	controllerUUID             string
	hostTag                    names.Tag
	logger                     Logger
	taskAPI                    TaskAPI
	distributionGroupFinder    DistributionGroupFinder
	toolsFinder                ToolsFinder
	machineChanges             watcher.StringsChannel
	retryChanges               watcher.NotifyChannel
	broker                     environs.InstanceBroker
	catacomb                   catacomb.Catacomb
	auth                       authentication.AuthenticationProvider
	imageStream                string
	harvestMode                config.HarvestMode
	harvestModeChan            chan config.HarvestMode
	retryStartInstanceStrategy RetryStrategy

	machinesMutex        sync.RWMutex
	machines             map[string]apiprovisioner.MachineProvisioner // machine ID -> machine
	machinesStarting     map[string]bool                              // machine IDs currently being started.
	machinesStopping     map[string]bool                              // machine IDs currently being stopped.
	machinesStopDeferred map[string]bool                              // machine IDs which were set as dead while starting. They will be stopped once they are online.
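	// availabilityZoneMachines tracks the provider's availability zones
	// and which machines have been placed, failed, or excluded in each.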
	availabilityZoneMachines []*AvailabilityZoneMachine
	instances                map[instance.Id]instances.Instance // instanceID -> instance
	cloudCallCtxFunc         common.CloudCallContextFunc

	// A worker pool for starting/stopping instances in parallel.
	wp         *workerpool.WorkerPool
	wpSizeChan chan int

	// eventProcessedCb is an optional, externally-registered callback that
	// will be invoked when the task main loop successfully processes an event.
	// The event type is provided as the first arg to the callback.
	eventProcessedCb func(string)
}

// Kill implements worker.Worker.Kill.
func (task *provisionerTask) Kill() {
	task.catacomb.Kill(nil)
}

// Wait implements worker.Worker.Wait.
func (task *provisionerTask) Wait() error {
	return task.catacomb.Wait()
}

func (task *provisionerTask) loop() (taskErr error) {
	task.logger.Infof("entering provisioner task loop; using provisioner pool with %d workers", task.wp.Size())
	defer func() {
		wpErr := task.wp.Close()
		if taskErr == nil {
			taskErr = wpErr
		}
		task.logger.Infof("exiting provisioner task loop; err: %v", taskErr)
	}()

	// Don't allow the harvesting mode to change until we have read at
	// least one set of changes, which will populate the task.machines
	// map. Otherwise we will potentially see all legitimate instances
	// as unknown.
	var harvestModeChan chan config.HarvestMode

	// When the watcher is started, it will have the initial changes be all
	// the machines that are relevant. Also, since this is available straight
	// away, we know there will be some changes right off the bat.
	ctx := task.cloudCallCtxFunc(stdcontext.Background())
	for {
		select {
		case ids, ok := <-task.machineChanges:
			if !ok {
				return errors.New("machine watcher closed channel")
			}

			if err := task.processMachines(ctx, ids); err != nil {
				return errors.Annotate(err, "processing updated machines")
			}

			task.notifyEventProcessedCallback(eventTypeProcessedMachines)

			// We've seen a set of changes.
			// Enable modification of harvesting mode.
			harvestModeChan = task.harvestModeChan
		case numWorkers := <-task.wpSizeChan:
			if task.wp.Size() == numWorkers {
				continue // nothing to do
			}

			// Stop the current pool (checking for any pending
			// errors) and create a new one.
			task.logger.Infof("resizing provision worker pool size to %d", numWorkers)
			if err := task.wp.Close(); err != nil {
				return err
			}
			task.wp = workerpool.NewWorkerPool(task.logger, numWorkers)
			task.notifyEventProcessedCallback(eventTypeResizedWorkerPool)
		case harvestMode := <-harvestModeChan:
			if harvestMode == task.harvestMode {
				break
			}
			task.logger.Infof("harvesting mode changed to %s", harvestMode)
			task.harvestMode = harvestMode
			task.notifyEventProcessedCallback(eventTypeHarvestModeChanged)
			if harvestMode.HarvestUnknown() {
				task.logger.Infof("harvesting unknown machines")
				if err := task.processMachines(ctx, nil); err != nil {
					return errors.Annotate(err, "processing machines after safe mode disabled")
				}
				task.notifyEventProcessedCallback(eventTypeProcessedMachines)
			}
		case <-task.retryChanges:
			if err := task.processMachinesWithTransientErrors(ctx); err != nil {
				return errors.Annotate(err, "processing machines with transient errors")
			}
			task.notifyEventProcessedCallback(eventTypeRetriedMachinesWithErrors)
		case <-task.wp.Done():
			// The worker pool has detected one or more errors and
			// is in the process of shutting down. Collect and
			// report any emitted errors.
			return task.wp.Close()
		case <-task.catacomb.Dying():
			return task.catacomb.ErrDying()
		}
	}
}

func (task *provisionerTask) notifyEventProcessedCallback(evtType string) {
	if task.eventProcessedCb != nil {
		task.eventProcessedCb(evtType)
	}
}

// SetHarvestMode implements ProvisionerTask.SetHarvestMode().
func (task *provisionerTask) SetHarvestMode(mode config.HarvestMode) {
	select {
	case task.harvestModeChan <- mode:
	case <-task.catacomb.Dying():
	}
}

// SetNumProvisionWorkers queues a pool resize request to be processed by the
// provisioner task main loop.
func (task *provisionerTask) SetNumProvisionWorkers(numWorkers int) {
	select {
	case task.wpSizeChan <- numWorkers:
	case <-task.catacomb.Dying():
	}
}

func (task *provisionerTask) processMachinesWithTransientErrors(ctx context.ProviderCallContext) error {
	results, err := task.taskAPI.MachinesWithTransientErrors()
	if err != nil || len(results) == 0 {
		return nil
	}
	task.logger.Tracef("processMachinesWithTransientErrors(%v)", results)
	var pending []apiprovisioner.MachineProvisioner
	for _, result := range results {
		if result.Status.Error != nil {
			task.logger.Errorf("cannot retry provisioning of machine %q: %v", result.Machine.Id(), result.Status.Error)
			continue
		}
		machine := result.Machine
		if err := machine.SetStatus(status.Pending, "", nil); err != nil {
			task.logger.Errorf("cannot reset status of machine %q: %v", machine.Id(), err)
			continue
		}
		if err := machine.SetInstanceStatus(status.Provisioning, "", nil); err != nil {
			task.logger.Errorf("cannot reset instance status of machine %q: %v", machine.Id(), err)
			continue
		}
		if err := machine.SetModificationStatus(status.Idle, "", nil); err != nil {
			task.logger.Errorf("cannot reset modification status of machine %q: %v", machine.Id(), err)
			continue
		}
		task.machinesMutex.Lock()
		task.machines[machine.Tag().String()] = machine
		task.machinesMutex.Unlock()
		pending = append(pending, machine)
	}
	return task.queueStartMachines(ctx, pending)
}

func (task *provisionerTask) processMachines(ctx context.ProviderCallContext, ids []string) error {
	task.logger.Tracef("processMachines(%v)", ids)

	// Populate the task's maps of current instances and machines.
	if err := task.populateMachineMaps(ctx, ids); err != nil {
		return errors.Trace(err)
	}

	// Maintain zone-machine distributions.
	err := task.updateAvailabilityZoneMachines(ctx)
	if err != nil && !errors.IsNotImplemented(err) {
		return errors.Annotate(err, "updating AZ distributions")
	}

	// Find machines without an instance ID or that are dead.
	pending, dead, err := task.pendingOrDead(ids)
	if err != nil {
		return errors.Trace(err)
	}

	// Queue removal of any dead machines that are not already being
	// stopped or flagged for deferred stopping once they are online.
	if err := task.filterAndQueueRemovalOfDeadMachines(ctx, dead); err != nil {
		return errors.Trace(err)
	}

	// Queue start requests for any other pending instances.
	return errors.Trace(task.queueStartMachines(ctx, pending))
}

func instanceIds(instances []instances.Instance) []string {
	ids := make([]string, 0, len(instances))
	for _, inst := range instances {
		ids = append(ids, string(inst.Id()))
	}
	return ids
}

// populateMachineMaps updates task.instances. Also updates task.machines map
// if a list of IDs is given.
func (task *provisionerTask) populateMachineMaps(ctx context.ProviderCallContext, ids []string) error {
	allInstances, err := task.broker.AllRunningInstances(ctx)
	if err != nil {
		return errors.Annotate(err, "getting all instances from broker")
	}

	instances := make(map[instance.Id]instances.Instance)
	for _, i := range allInstances {
		instances[i.Id()] = i
	}
	task.machinesMutex.Lock()
	task.instances = instances
	task.machinesMutex.Unlock()

	// Update the machines map with new data for each of the machines in the
	// change list.
	machineTags := make([]names.MachineTag, len(ids))
	for i, id := range ids {
		machineTags[i] = names.NewMachineTag(id)
	}
	machines, err := task.taskAPI.Machines(machineTags...)
	if err != nil {
		return errors.Annotatef(err, "getting machines %v", ids)
	}
	task.machinesMutex.Lock()
	defer task.machinesMutex.Unlock()
	for i, result := range machines {
		switch {
		case result.Err == nil:
			task.machines[result.Machine.Id()] = result.Machine
		case params.IsCodeNotFoundOrCodeUnauthorized(result.Err):
			task.logger.Debugf("machine %q not found in state", ids[i])
			delete(task.machines, ids[i])
		default:
			return errors.Annotatef(result.Err, "getting machine %v", ids[i])
		}
	}
	return nil
}

// pendingOrDead looks up machines with ids and returns those that do not
// have an instance id assigned yet, and also those that are dead. Any machines
// that are currently being stopped or have been marked for deferred stopping
// once they are online will be skipped.
func (task *provisionerTask) pendingOrDead(
	ids []string,
) (pending, dead []apiprovisioner.MachineProvisioner, err error) {
	task.machinesMutex.RLock()
	defer task.machinesMutex.RUnlock()
	for _, id := range ids {
		// Ignore machines that have either been queued for deferred
		// stopping or are currently being stopped.
		if _, found := task.machinesStopDeferred[id]; found {
			task.logger.Tracef("pendingOrDead: ignoring machine %q; machine has deferred stop flag set", id)
			continue // ignore: will be stopped once started
		} else if _, found := task.machinesStopping[id]; found {
			task.logger.Tracef("pendingOrDead: ignoring machine %q; machine is currently being stopped", id)
			continue // ignore: currently being stopped.
		}

		machine, found := task.machines[id]
		if !found {
			task.logger.Infof("machine %q not found", id)
			continue
		}
		var classification MachineClassification
		classification, err = classifyMachine(task.logger, machine)
		if err != nil {
			return // return the error
		}
		switch classification {
		case Pending:
			pending = append(pending, machine)
		case Dead:
			dead = append(dead, machine)
		}
	}
	task.logger.Tracef("pending machines: %v", pending)
	task.logger.Tracef("dead machines: %v", dead)
	return
}

type ClassifiableMachine interface {
	Life() life.Value
	InstanceId() (instance.Id, error)
	EnsureDead() error
	Status() (status.Status, string, error)
	InstanceStatus() (status.Status, string, error)
	Id() string
}

type MachineClassification string

const (
	None    MachineClassification = "none"
	Pending MachineClassification = "Pending"
	Dead    MachineClassification = "Dead"
)

func classifyMachine(logger Logger, machine ClassifiableMachine) (
	MachineClassification, error) {
	switch machine.Life() {
	case life.Dying:
		if _, err := machine.InstanceId(); err == nil {
			return None, nil
		} else if !params.IsCodeNotProvisioned(err) {
			return None, errors.Annotatef(err, "loading dying machine id:%s, details:%v", machine.Id(), machine)
		}
		logger.Infof("killing dying, unprovisioned machine %q", machine)
		if err := machine.EnsureDead(); err != nil {
			return None, errors.Annotatef(err, "ensuring machine dead id:%s, details:%v", machine.Id(), machine)
		}
		fallthrough
	case life.Dead:
		return Dead, nil
	}
	instId, err := machine.InstanceId()
	if err != nil {
		if !params.IsCodeNotProvisioned(err) {
			return None, errors.Annotatef(err, "loading machine id:%s, details:%v", machine.Id(), machine)
		}
		machineStatus, _, err := machine.Status()
		if err != nil {
			logger.Infof("cannot get machine id:%s, details:%v, err:%v", machine.Id(), machine, err)
			return None, nil
		}
		if machineStatus == status.Pending {
			logger.Infof("found machine pending provisioning id:%s, details:%v", machine.Id(), machine)
			return Pending, nil
		}
		instanceStatus, _, err := machine.InstanceStatus()
		if err != nil {
			logger.Infof("cannot read instance status id:%s, details:%v, err:%v", machine.Id(), machine, err)
			return None, nil
		}
		if instanceStatus == status.Provisioning {
			logger.Infof("found machine provisioning id:%s, details:%v", machine.Id(), machine)
			return Pending, nil
		}
		return None, nil
	}
	logger.Infof("machine %s already started as instance %q", machine.Id(), instId)

	return None, nil
}

// findUnknownInstances finds instances which are not associated with a machine.
func (task *provisionerTask) findUnknownInstances(stopping []instances.Instance) ([]instances.Instance, error) {
	// Make a copy of the instances we know about.
	taskInstances := make(map[instance.Id]instances.Instance)
	for k, v := range task.instances {
		taskInstances[k] = v
	}

	task.machinesMutex.RLock()
	defer task.machinesMutex.RUnlock()
	for _, m := range task.machines {
		instId, err := m.InstanceId()
		switch {
		case err == nil:
			delete(taskInstances, instId)
		case params.IsCodeNotProvisioned(err):
		case params.IsCodeNotFoundOrCodeUnauthorized(err):
		default:
			return nil, err
		}
	}
	// Now remove all those instances that we are stopping already as we
	// know about those and don't want to include them in the unknown list.
	for _, inst := range stopping {
		delete(taskInstances, inst.Id())
	}
	var unknown []instances.Instance
	for _, inst := range taskInstances {
		unknown = append(unknown, inst)
	}
	return unknown, nil
}

// filterAndQueueRemovalOfDeadMachines scans the list of dead machines and:
// - Sets the deferred stop flag for machines that are still online
// - Filters out any machines that are either stopping or have the deferred
//   stop flag set.
// - Marks the remaining machines as stopping and queues a request for them to
//   be cleaned up.
func (task *provisionerTask) filterAndQueueRemovalOfDeadMachines(ctx context.ProviderCallContext, dead []apiprovisioner.MachineProvisioner) error {
	// Flag any machines in the dead list that are still being started so
	// they will be stopped once they come online.
	task.deferStopForNotYetStartedMachines(dead)

	// Filter the initial dead machine list. Any machines marked for
	// deferred stopping, machines that are already being stopped and
	// machines that have not yet finished provisioning will be removed
	// from the filtered list.
	dead = task.filterDeadMachines(dead)

	// The remaining machines will be removed asynchronously and this
	// method can be invoked again concurrently to process another machine
	// change event. To avoid attempts to remove the same machines twice,
	// they are flagged as stopping.
	task.machinesMutex.Lock()
	for _, machine := range dead {
		machID := machine.Id()
		if !task.machinesStopDeferred[machID] {
			task.machinesStopping[machID] = true
		}
	}
	task.machinesMutex.Unlock()
	return task.queueRemovalOfDeadMachines(ctx, dead)
}

func (task *provisionerTask) queueRemovalOfDeadMachines(
	ctx context.ProviderCallContext,
	dead []apiprovisioner.MachineProvisioner,
) error {
	// Collect the instances for all provisioned machines that are dead.
	stopping := task.instancesForDeadMachines(dead)

	// Find running instances that have no machines associated.
	unknown, err := task.findUnknownInstances(stopping)
	if err != nil {
		return errors.Trace(err)
	}

	if !task.harvestMode.HarvestUnknown() && len(unknown) != 0 {
		task.logger.Infof(
			"%s is set to %s; unknown instances not stopped %v",
			config.ProvisionerHarvestModeKey,
			task.harvestMode.String(),
			instanceIds(unknown),
		)
		unknown = nil
	}

	if (task.harvestMode.HarvestNone() || !task.harvestMode.HarvestDestroyed()) && len(stopping) != 0 {
		task.logger.Infof(
			`%s is set to "%s"; will not harvest %s`,
			config.ProvisionerHarvestModeKey,
			task.harvestMode.String(),
			instanceIds(stopping),
		)
		stopping = nil
	}

	if len(dead) == 0 {
		return nil // nothing to do
	}

	provTask := workerpool.Task{
		Type: "stop-instances",
		Process: func() error {
			if len(stopping) > 0 {
				task.logger.Infof("stopping known instances %v", instanceIds(stopping))
			}
			if len(unknown) > 0 {
				task.logger.Infof("stopping unknown instances %v", instanceIds(unknown))
			}

			// It is important that we stop unknown instances before starting
			// pending ones, because if we start an instance and then fail to
			// set its InstanceId on the machine, we don't want to start a new
			// instance for the same machine ID.
			if err := task.doStopInstances(ctx, append(stopping, unknown...)); err != nil {
				return errors.Trace(err)
			}

			// Remove any dead machines from state.
			for _, machine := range dead {
				task.logger.Infof("removing dead machine %q", machine.Id())
				if err := machine.MarkForRemoval(); err != nil {
					task.logger.Errorf("failed to remove dead machine %q", machine.Id())
				}
				task.removeMachineFromAZMap(machine)
				machID := machine.Id()
				task.machinesMutex.Lock()
				delete(task.machines, machID)
				delete(task.machinesStopping, machID)
				task.machinesMutex.Unlock()
			}

			return nil
		},
	}

	select {
	case task.wp.Queue() <- provTask:
		// successfully enqueued removal request
		return nil
	case <-task.catacomb.Dying():
		return task.catacomb.ErrDying()
	case <-task.wp.Done():
		// Capture and surface asynchronous worker pool errors.
		return task.wp.Close()
	}
}

// filterDeadMachines filters the provided dead machines, removing any machines
// marked for deferred stopping, machines that are currently being stopped and
// any machines that have not finished starting.
func (task *provisionerTask) filterDeadMachines(dead []apiprovisioner.MachineProvisioner) []apiprovisioner.MachineProvisioner {
	var deadMachines []apiprovisioner.MachineProvisioner

	task.machinesMutex.Lock()
	for _, machine := range dead {
		machID := machine.Id()

		// Ignore any machines for which stopping has been deferred,
		// that are currently being stopped, or that are still being
		// started.
		if task.machinesStopDeferred[machID] || task.machinesStopping[machID] || task.machinesStarting[machID] {
			continue
		}

		// This machine should be queued for deletion.
		deadMachines = append(deadMachines, machine)
	}
	task.machinesMutex.Unlock()

	return deadMachines
}

// deferStopForNotYetStartedMachines iterates the list of dead machines and
// flags the ones that are still being started so they can be immediately
// stopped once they come online.
func (task *provisionerTask) deferStopForNotYetStartedMachines(dead []apiprovisioner.MachineProvisioner) {
	task.machinesMutex.Lock()
	for _, machine := range dead {
		machID := machine.Id()
		if task.machinesStarting[machID] {
			task.machinesStopDeferred[machID] = true
		}
	}
	task.machinesMutex.Unlock()
}

// instancesForDeadMachines returns a list of instances that correspond to
// machines with a life of "dead" in state. Missing machines and machines that
// have not finished starting are omitted from the list.
func (task *provisionerTask) instancesForDeadMachines(dead []apiprovisioner.MachineProvisioner) []instances.Instance {
	var deadInstances []instances.Instance
	for _, machine := range dead {
		// Ignore machines that are still provisioning.
		task.machinesMutex.RLock()
		if task.machinesStarting[machine.Id()] {
			task.machinesMutex.RUnlock()
			continue
		}
		task.machinesMutex.RUnlock()

		instId, err := machine.InstanceId()
		if err == nil {
			keep, _ := machine.KeepInstance()
			if keep {
				task.logger.Debugf("machine %v is dead but keep-instance is true", instId)
				continue
			}

			// If the instance is not found we can't stop it.
			if inst, found := task.instances[instId]; found {
				deadInstances = append(deadInstances, inst)
			}
		}
	}
	return deadInstances
}

func (task *provisionerTask) doStopInstances(ctx context.ProviderCallContext, instances []instances.Instance) error {
	// Although calling StopInstance with an empty slice should produce no change in the
	// provider, environs like dummy do not consider this a noop.
	if len(instances) == 0 {
		return nil
	}
	if wrench.IsActive("provisioner", "stop-instances") {
		return errors.New("wrench in the works")
	}

	ids := make([]instance.Id, len(instances))
	for i, inst := range instances {
		ids[i] = inst.Id()
	}
	if err := task.broker.StopInstances(ctx, ids...); err != nil {
		return errors.Annotate(err, "stopping instances")
	}
	return nil
}

func (task *provisionerTask) constructInstanceConfig(
	machine apiprovisioner.MachineProvisioner,
	auth authentication.AuthenticationProvider,
	pInfo *params.ProvisioningInfo,
) (*instancecfg.InstanceConfig, error) {

	apiInfo, err := auth.SetupAuthentication(machine)
	if err != nil {
		return nil, errors.Annotate(err, "setting up authentication")
	}

	// Generate a nonce for the new instance, with the format: "machine-#:UUID".
	// The first part is a badge, specifying the tag of the machine the provisioner
	// is running on, while the second part is a random UUID.
	uuid, err := utils.NewUUID()
	if err != nil {
		return nil, errors.Annotate(err, "generating nonce for machine "+machine.Id())
	}

	nonce := fmt.Sprintf("%s:%s", task.hostTag, uuid)
	base, err := corebase.ParseBase(pInfo.Base.Name, pInfo.Base.Channel)
	if err != nil {
		return nil, errors.Annotatef(err, "parsing machine base %q", pInfo.Base)
	}
	instanceConfig, err := instancecfg.NewInstanceConfig(
		names.NewControllerTag(controller.Config(pInfo.ControllerConfig).ControllerUUID()),
		machine.Id(),
		nonce,
		task.imageStream,
		base,
		apiInfo,
	)
	if err != nil {
		return nil, errors.Trace(err)
	}

	instanceConfig.ControllerConfig = make(map[string]interface{})
	for k, v := range pInfo.ControllerConfig {
		instanceConfig.ControllerConfig[k] = v
	}

	instanceConfig.Tags = pInfo.Tags
	if len(pInfo.Jobs) > 0 {
		instanceConfig.Jobs = pInfo.Jobs
	}

	if instanceConfig.IsController() {
		publicKey, err := simplestreams.UserPublicSigningKey()
		if err != nil {
			return nil, errors.Trace(err)
		}
		instanceConfig.PublicImageSigningKey = publicKey
	}

	instanceConfig.CloudInitUserData = pInfo.CloudInitUserData

	return instanceConfig, nil
}

func (task *provisionerTask) constructStartInstanceParams(
	controllerUUID string,
	machine apiprovisioner.MachineProvisioner,
	instanceConfig *instancecfg.InstanceConfig,
	provisioningInfo *params.ProvisioningInfo,
	possibleTools coretools.List,
) (environs.StartInstanceParams, error) {

	volumes := make([]storage.VolumeParams, len(provisioningInfo.Volumes))
	for i, v := range provisioningInfo.Volumes {
		volumeTag, err := names.ParseVolumeTag(v.VolumeTag)
		if err != nil {
			return environs.StartInstanceParams{}, errors.Trace(err)
		}
		if v.Attachment == nil {
			return environs.StartInstanceParams{}, errors.Errorf("volume params missing attachment")
		}
		machineTag, err := names.ParseMachineTag(v.Attachment.MachineTag)
		if err != nil {
			return environs.StartInstanceParams{}, errors.Trace(err)
		}
		if machineTag != machine.Tag() {
			return environs.StartInstanceParams{}, errors.Errorf("volume attachment params has invalid machine tag")
		}
		if v.Attachment.InstanceId != "" {
			return environs.StartInstanceParams{}, errors.Errorf("volume attachment params specifies instance ID")
		}
		volumes[i] = storage.VolumeParams{
			Tag:          volumeTag,
			Size:         v.Size,
			Provider:     storage.ProviderType(v.Provider),
			Attributes:   v.Attributes,
			ResourceTags: v.Tags,
			Attachment: &storage.VolumeAttachmentParams{
				AttachmentParams: storage.AttachmentParams{
					Machine:  machineTag,
					ReadOnly: v.Attachment.ReadOnly,
				},
				Volume: volumeTag,
			},
		}
	}
	volumeAttachments := make([]storage.VolumeAttachmentParams, len(provisioningInfo.VolumeAttachments))
	for i, v := range provisioningInfo.VolumeAttachments {
		volumeTag, err := names.ParseVolumeTag(v.VolumeTag)
		if err != nil {
			return environs.StartInstanceParams{}, errors.Trace(err)
		}
		machineTag, err := names.ParseMachineTag(v.MachineTag)
		if err != nil {
			return environs.StartInstanceParams{}, errors.Trace(err)
		}
		if machineTag != machine.Tag() {
			return environs.StartInstanceParams{}, errors.Errorf("volume attachment params has invalid machine tag")
		}
		if v.InstanceId != "" {
			return environs.StartInstanceParams{}, errors.Errorf("volume attachment params specifies instance ID")
		}
		if v.VolumeId == "" {
			return environs.StartInstanceParams{}, errors.Errorf("volume attachment params does not specify volume ID")
		}
		volumeAttachments[i] = storage.VolumeAttachmentParams{
			AttachmentParams: storage.AttachmentParams{
				Provider: storage.ProviderType(v.Provider),
				Machine:  machineTag,
				ReadOnly: v.ReadOnly,
			},
			Volume:   volumeTag,
			VolumeId: v.VolumeId,
		}
	}

	var endpointBindings map[string]network.Id
	if len(provisioningInfo.EndpointBindings) != 0 {
		endpointBindings = make(map[string]network.Id)
		for endpoint, space := range provisioningInfo.EndpointBindings {
			endpointBindings[endpoint] = network.Id(space)
		}
	}

	possibleImageMetadata := make([]*imagemetadata.ImageMetadata, len(provisioningInfo.ImageMetadata))
	for i, metadata := range provisioningInfo.ImageMetadata {
		possibleImageMetadata[i] = &imagemetadata.ImageMetadata{
			Id:          metadata.ImageId,
			Arch:        metadata.Arch,
			RegionAlias: metadata.Region,
			RegionName:  metadata.Region,
			Storage:     metadata.RootStorageType,
			Stream:      metadata.Stream,
			VirtType:    metadata.VirtType,
			Version:     metadata.Version,
		}
	}

	startInstanceParams := environs.StartInstanceParams{
		ControllerUUID:    controllerUUID,
		Constraints:       provisioningInfo.Constraints,
		Tools:             possibleTools,
		InstanceConfig:    instanceConfig,
		Placement:         provisioningInfo.Placement,
		Volumes:           volumes,
		VolumeAttachments: volumeAttachments,
		SubnetsToZones:    subnetZonesFromNetworkTopology(provisioningInfo.ProvisioningNetworkTopology),
		EndpointBindings:  endpointBindings,
		ImageMetadata:     possibleImageMetadata,
		StatusCallback:    machine.SetInstanceStatus,
		Abort:             task.catacomb.Dying(),
		CharmLXDProfiles:  provisioningInfo.CharmLXDProfiles,
	}
	if provisioningInfo.RootDisk != nil {
		startInstanceParams.RootDisk = &storage.VolumeParams{
			Provider:   storage.ProviderType(provisioningInfo.RootDisk.Provider),
			Attributes: provisioningInfo.RootDisk.Attributes,
		}
	}

	return startInstanceParams, nil
}

// AvailabilityZoneMachine keeps track of a single zone and which machines
// are in it, which machines have failed to use it and which machines
// shouldn't use it. This data is used to decide on how to distribute
// machines across availability zones.
//
// Exposed for testing.
type AvailabilityZoneMachine struct {
	ZoneName           string
	MachineIds         set.Strings
	FailedMachineIds   set.Strings
	ExcludedMachineIds set.Strings // Don't use these machines in the zone.
}

// MatchesConstraints matches against an AZ. If the constraints specify Zones,
// make sure this AZ matches a listed ZoneName.
func (az *AvailabilityZoneMachine) MatchesConstraints(cons constraints.Value) bool {
	if !cons.HasZones() {
		return true
	}
	for _, zone := range *cons.Zones {
		if az.ZoneName == zone {
			return true
		}
	}
	return false
}

// updateAvailabilityZoneMachines maintains a mapping of AZs to machines
// running in each zone.
// If the provider does not implement the ZonedEnviron interface, return nil.
func (task *provisionerTask) updateAvailabilityZoneMachines(ctx context.ProviderCallContext) error {
	zonedEnv, ok := task.broker.(providercommon.ZonedEnviron)
	if !ok {
		return nil
	}

	task.machinesMutex.Lock()
	defer task.machinesMutex.Unlock()

	// Only populate from the provider if we have no data.
	// Otherwise, just check that we know all the current AZs.
	if len(task.availabilityZoneMachines) == 0 {
		if err := task.populateAvailabilityZoneMachines(ctx, zonedEnv); err != nil {
			return errors.Trace(err)
		}
	} else {
		if err := task.checkProviderAvailabilityZones(ctx, zonedEnv); err != nil {
			return errors.Trace(err)
		}
	}

	zones := make([]string, len(task.availabilityZoneMachines))
	for i, azm := range task.availabilityZoneMachines {
		zones[i] = azm.ZoneName
	}
	task.logger.Infof("provisioning in zones: %v", zones)

	return nil
}

// populateAvailabilityZoneMachines populates the slice,
// availabilityZoneMachines, with each zone and the IDs of
// machines running in that zone, according to the provider.
func (task *provisionerTask) populateAvailabilityZoneMachines(
	ctx context.ProviderCallContext, zonedEnv providercommon.ZonedEnviron,
) error {
	availabilityZoneInstances, err := providercommon.AvailabilityZoneAllocations(zonedEnv, ctx, []instance.Id{})
	if err != nil {
		return errors.Trace(err)
	}

	instanceMachines := make(map[instance.Id]string)
	for _, machine := range task.machines {
		instId, err := machine.InstanceId()
		if err != nil {
			continue
		}
		instanceMachines[instId] = machine.Id()
	}

	// Translate instance IDs to machine IDs to aid distributing
	// to-be-created instances across availability zones.
	task.availabilityZoneMachines = make([]*AvailabilityZoneMachine, len(availabilityZoneInstances))
	for i, azInstances := range availabilityZoneInstances {
		machineIds := set.NewStrings()
		for _, instanceId := range azInstances.Instances {
			if id, ok := instanceMachines[instanceId]; ok {
				machineIds.Add(id)
			}
		}
		task.availabilityZoneMachines[i] = &AvailabilityZoneMachine{
			ZoneName:           azInstances.ZoneName,
			MachineIds:         machineIds,
			FailedMachineIds:   set.NewStrings(),
			ExcludedMachineIds: set.NewStrings(),
		}
	}
	return nil
}

// checkProviderAvailabilityZones queries the known AZs.
// If any are missing from the AZ-machines slice, add them.
// If we have entries that are not known by the provider to be available zones,
// check whether we have machines there.
// If so, log a warning, otherwise we can delete them safely.
func (task *provisionerTask) checkProviderAvailabilityZones(
	ctx context.ProviderCallContext, zonedEnv providercommon.ZonedEnviron,
) error {
	azs, err := zonedEnv.AvailabilityZones(ctx)
	if err != nil {
		return errors.Trace(err)
	}

	zones := set.NewStrings()
	for _, z := range azs {
		if z.Available() {
			zones.Add(z.Name())
		}
	}

	// Process all the zones that the provisioner knows about.
	newAZMs := task.availabilityZoneMachines[:0]
	for _, azm := range task.availabilityZoneMachines {
		// Provider has the zone as available, and we know it. All good.
		if zones.Contains(azm.ZoneName) {
			newAZMs = append(newAZMs, azm)
			zones.Remove(azm.ZoneName)
			continue
		}

		// If the zone isn't available, but we think we have machines there,
		// play it safe and retain the entry.
		if len(azm.MachineIds) > 0 {
			task.logger.Warningf("machines %v are in zone %q, which is not available, or not known by the cloud",
				azm.MachineIds.Values(), azm.ZoneName)
			newAZMs = append(newAZMs, azm)
		}

		// Fallthrough is for the zone's entry to be dropped.
		// We don't retain it for newAZMs.
		// The new list is logged by the caller.
	}
	task.availabilityZoneMachines = newAZMs

	// Add any remaining zones to the list.
	// Since this method is only called if we have previously populated the
	// zone-machines slice, we can't have provisioned machines in the zone yet.
	for _, z := range zones.Values() {
		task.availabilityZoneMachines = append(task.availabilityZoneMachines, &AvailabilityZoneMachine{
			ZoneName:           z,
			MachineIds:         set.NewStrings(),
			FailedMachineIds:   set.NewStrings(),
			ExcludedMachineIds: set.NewStrings(),
		})
	}
	return nil
}

// populateDistributionGroupZoneMap returns a zone mapping which only includes
// machines in the same distribution group. This is used to determine where new
// machines in that distribution group should be placed.
func (task *provisionerTask) populateDistributionGroupZoneMap(machineIds []string) []*AvailabilityZoneMachine {
	var dgAvailabilityZoneMachines []*AvailabilityZoneMachine
	dgSet := set.NewStrings(machineIds...)
	for _, azm := range task.availabilityZoneMachines {
		dgAvailabilityZoneMachines = append(dgAvailabilityZoneMachines, &AvailabilityZoneMachine{
			azm.ZoneName,
			azm.MachineIds.Intersection(dgSet),
			azm.FailedMachineIds,
			azm.ExcludedMachineIds,
		})
	}
	return dgAvailabilityZoneMachines
}

// machineAvailabilityZoneDistribution returns a suggested availability zone
// for the specified machine to start in.
// If the current provider does not implement availability zones, "" and no
// error will be returned.
// Machines are spread across availability zones based on lowest population of
// the "available" zones, and any supplied zone constraints.
// Machines in the same DistributionGroup are placed in different zones,
// distributed based on lowest population of machines in that DistributionGroup.
// Machines are not placed in a zone they are excluded from.
// If availability zones are implemented and one isn't found, return a NotFound error.
func (task *provisionerTask) machineAvailabilityZoneDistribution(
	machineId string, distGroupMachineIds []string, cons constraints.Value,
) (string, error) {
	task.machinesMutex.Lock()
	defer task.machinesMutex.Unlock()

	if len(task.availabilityZoneMachines) == 0 {
		return "", nil
	}

	// Assign an initial zone to a machine based on lowest population,
	// accommodating any supplied zone constraints.
	// If the machine has a distribution group, assign based on lowest zone
	// population of the distribution group machine.
	// If more than one zone has the same number of machines, pick one of those at random.
	zoneMachines := task.availabilityZoneMachines
	if len(distGroupMachineIds) > 0 {
		zoneMachines = task.populateDistributionGroupZoneMap(distGroupMachineIds)
	}

	// Make a map of zone machines keyed on count.
	zoneMap := make(map[int][]*AvailabilityZoneMachine)
	for _, zm := range zoneMachines {
		machineCount := zm.MachineIds.Size()
		zoneMap[machineCount] = append(zoneMap[machineCount], zm)
	}
	// Sort the counts we have by size so
	// we can process starting with the lowest.
	var zoneCounts []int
	for k := range zoneMap {
		zoneCounts = append(zoneCounts, k)
	}
	sort.Ints(zoneCounts)

	var machineZone string
done:
	// Starting with the lowest count first, find a suitable AZ.
	for _, count := range zoneCounts {
		zmList := zoneMap[count]
		for len(zmList) > 0 {
			// Pick a random AZ to try.
			index := rand.Intn(len(zmList))
			zoneMachines := zmList[index]
			if !zoneMachines.MatchesConstraints(cons) {
				task.logger.Debugf("machine %s does not match az %s: constraints do not match",
					machineId, zoneMachines.ZoneName)
			} else if zoneMachines.FailedMachineIds.Contains(machineId) {
				task.logger.Debugf("machine %s does not match az %s: excluded in failed machine ids",
					machineId, zoneMachines.ZoneName)
			} else if zoneMachines.ExcludedMachineIds.Contains(machineId) {
				task.logger.Debugf("machine %s does not match az %s: excluded machine id",
					machineId, zoneMachines.ZoneName)
			} else {
				// Success, we're out of here.
				machineZone = zoneMachines.ZoneName
				break done
			}
			// Zone not suitable so remove it from the list and try the next one.
			zmList = append(zmList[:index], zmList[index+1:]...)
		}
	}

	if machineZone == "" {
		return machineZone, errors.NotFoundf("suitable availability zone for machine %v", machineId)
	}

	for _, zoneMachines := range task.availabilityZoneMachines {
		if zoneMachines.ZoneName == machineZone {
			zoneMachines.MachineIds.Add(machineId)
			break
		}
	}
	return machineZone, nil
}

// queueStartMachines resolves the distribution groups for the provided
// machines and enqueues a request for starting each one. If the distribution
// group resolution fails for a particular machine, the method sets an error
// status on that machine and returns early only if that status update fails.
// Any provisioning-related errors are reported asynchronously by the worker
// pool.
func (task *provisionerTask) queueStartMachines(ctx context.ProviderCallContext, machines []apiprovisioner.MachineProvisioner) error {
	if len(machines) == 0 {
		return nil
	}

	// Get the distributionGroups for each machine now to avoid
	// successive calls to DistributionGroupByMachineId which will
	// return the same data.
	machineTags := make([]names.MachineTag, len(machines))
	for i, machine := range machines {
		machineTags[i] = machine.MachineTag()
	}
	machineDistributionGroups, err := task.distributionGroupFinder.DistributionGroupByMachineId(machineTags...)
	if err != nil {
		return errors.Trace(err)
	}

	// Get all the provisioning info at once, so that we don't make many
	// singular requests in parallel to an API that supports batching.
	// Key the results by machine ID for retrieval in the loop below.
	// We rely here on the API guarantee - that the returned results are
	// ordered to correspond to the call arguments.
	pInfoResults, err := task.taskAPI.ProvisioningInfo(machineTags)
	if err != nil {
		return errors.Trace(err)
	}
	pInfoMap := make(map[string]params.ProvisioningInfoResult, len(pInfoResults.Results))
	for i, tag := range machineTags {
		pInfoMap[tag.Id()] = pInfoResults.Results[i]
	}

	for i, m := range machines {
		if machineDistributionGroups[i].Err != nil {
			if err := task.setErrorStatus("fetching distribution groups for machine %q: %v", m, machineDistributionGroups[i].Err); err != nil {
				return errors.Trace(err)
			}
			continue
		}

		// Create and enqueue start instance request. Keep track of
		// the pending request so that if a deletion request comes in
		// before the machine has completed provisioning we can defer
		// it until it does.
		task.machinesMutex.Lock()
		task.machinesStarting[m.Id()] = true
		task.machinesMutex.Unlock()

		// Reassign the loop variable to prevent
		// overwriting the dispatched references.
		machine := m
		distGroup := machineDistributionGroups[i].MachineIds

		provTask := workerpool.Task{
			Type: fmt.Sprintf("start-instance %s", machine.Id()),
			Process: func() error {
				machID := machine.Id()

				if provisionErr := task.doStartMachine(ctx, machine, distGroup, pInfoMap[machID]); provisionErr != nil {
					return provisionErr
				}

				task.machinesMutex.Lock()
				delete(task.machinesStarting, machID)
				// If the provisioning succeeded but a deletion
				// request has been deferred, queue it now.
				stopDeferred := task.machinesStopDeferred[machID]
				if stopDeferred {
					delete(task.machinesStopDeferred, machID)
					task.machinesStopping[machID] = true
				}
				task.machinesMutex.Unlock()

				if stopDeferred {
					task.logger.Debugf("triggering deferred stop of machine %q", machID)
					return task.queueRemovalOfDeadMachines(ctx, []apiprovisioner.MachineProvisioner{
						machine,
					})
				}

				return nil
			},
		}

		select {
		case task.wp.Queue() <- provTask:
			// successfully enqueued provision request
		case <-task.catacomb.Dying():
			return task.catacomb.ErrDying()
		case <-task.wp.Done():
			// Capture and surface asynchronous worker pool errors.
			return task.wp.Close()
		}
	}

	return nil
}

func (task *provisionerTask) setErrorStatus(msg string, machine apiprovisioner.MachineProvisioner, err error) error {
	task.logger.Errorf(msg, machine, err)
	errForStatus := errors.Cause(err)
	if err2 := machine.SetInstanceStatus(status.ProvisioningError, errForStatus.Error(), nil); err2 != nil {
		// Something is wrong with this machine, better report it back.
		return errors.Annotatef(err2, "setting error status for machine %q", machine)
	}
	return nil
}

func (task *provisionerTask) doStartMachine(
	ctx context.ProviderCallContext,
	machine apiprovisioner.MachineProvisioner,
	distributionGroupMachineIds []string,
	pInfoResult params.ProvisioningInfoResult,
) (startErr error) {
	defer func() {
		if startErr == nil {
			return
		}

		// Mask the error if the machine has the deferred stop flag set.
		// A stop request will be triggered immediately once this
		// method returns.
		task.machinesMutex.RLock()
		defer task.machinesMutex.RUnlock()
		machID := machine.Id()
		if task.machinesStopDeferred[machID] {
			task.logger.Tracef("doStartMachine: ignoring doStartMachine error (%v) for machine %q; machine has been marked dead while it was being started and has the deferred stop flag set", startErr, machID)
			startErr = nil
		}
	}()

	if err := machine.SetInstanceStatus(status.Provisioning, "starting", nil); err != nil {
		task.logger.Errorf("%v", err)
	}

	v, err := machine.ModelAgentVersion()
	if err != nil {
		return errors.Trace(err)
	}

	startInstanceParams, err := task.setupToStartMachine(machine, v, pInfoResult)
	if err != nil {
		return errors.Trace(task.setErrorStatus("%v %v", machine, err))
	}

	// Figure out if the zones available to use for a new instance are
	// restricted based on placement, and if so exclude those machines
	// from being started in any other zone.
	if err := task.populateExcludedMachines(ctx, machine.Id(), startInstanceParams); err != nil {
		return errors.Trace(err)
	}

	// TODO ProvisionerParallelization 2017-10-03
	// Improve the retry loop, newer methodology
	// Is rate limiting handled correctly?
	var result *environs.StartInstanceResult

	// Attempt creating the instance "retryCount" times. If the provider
	// supports availability zones and we're automatically distributing
	// across the zones, then we try each zone for every attempt, or until
	// one of the StartInstance calls returns an error satisfying
	// Is(err, environs.ErrAvailabilityZoneIndependent)
	for attemptsLeft := task.retryStartInstanceStrategy.retryCount; attemptsLeft >= 0; {
		if startInstanceParams.AvailabilityZone, err = task.machineAvailabilityZoneDistribution(
			machine.Id(), distributionGroupMachineIds, startInstanceParams.Constraints,
		); err != nil {
			return task.setErrorStatus("cannot start instance for machine %q: %v", machine, err)
		}
		if startInstanceParams.AvailabilityZone != "" {
			task.logger.Infof("trying machine %s StartInstance in availability zone %s",
				machine, startInstanceParams.AvailabilityZone)
		}

		attemptResult, err := task.broker.StartInstance(ctx, startInstanceParams)
		if err == nil {
			result = attemptResult
			break
		} else if attemptsLeft <= 0 {
			// Set the state to error, so the machine will be skipped
			// next time until the error is resolved.
			task.removeMachineFromAZMap(machine)
			return task.setErrorStatus("cannot start instance for machine %q: %v", machine, err)
		} else {
			if startInstanceParams.AvailabilityZone != "" {
				task.logger.Warningf("machine %s failed to start in availability zone %s: %v",
					machine, startInstanceParams.AvailabilityZone, err)
			} else {
				task.logger.Warningf("machine %s failed to start: %v", machine, err)
			}
		}

		retrying := true
		retryMsg := ""
		if startInstanceParams.AvailabilityZone != "" && !errors.Is(err, environs.ErrAvailabilityZoneIndependent) {
			// We've specified a zone, and the error may be specific to
			// that zone. Retry in another zone if there are any untried.
			azRemaining, err2 := task.markMachineFailedInAZ(machine,
				startInstanceParams.AvailabilityZone, startInstanceParams.Constraints)
			if err2 != nil {
				if err = task.setErrorStatus("cannot start instance: %v", machine, err2); err != nil {
					task.logger.Errorf("setting error status: %s", err)
				}
				return err2
			}
			if azRemaining {
				retryMsg = fmt.Sprintf(
					"failed to start machine %s in zone %q, retrying in %v with new availability zone: %s",
					machine, startInstanceParams.AvailabilityZone,
					task.retryStartInstanceStrategy.retryDelay, err,
				)
				task.logger.Debugf("%s", retryMsg)
				// There are still more zones to try, so don't decrement "attemptsLeft" yet.
				retrying = false
			} else {
				// All availability zones have been attempted for this iteration,
				// clear the failures for the next time around. A given zone may
				// succeed after a prior failure.
				task.clearMachineAZFailures(machine)
			}
		}
		if retrying {
			retryMsg = fmt.Sprintf(
				"failed to start machine %s (%s), retrying in %v (%d more attempts)",
				machine, err.Error(), task.retryStartInstanceStrategy.retryDelay, attemptsLeft,
			)
			task.logger.Warningf("%s", retryMsg)
			attemptsLeft--
		}

		if err3 := machine.SetInstanceStatus(status.Provisioning, retryMsg, nil); err3 != nil {
			task.logger.Warningf("failed to set instance status: %v", err3)
		}

		select {
		case <-task.catacomb.Dying():
			return task.catacomb.ErrDying()
		case <-time.After(task.retryStartInstanceStrategy.retryDelay):
		}
	}

	networkConfig := params.NetworkConfigFromInterfaceInfo(result.NetworkInfo)
	volumes := volumesToAPIServer(result.Volumes)
	volumeNameToAttachmentInfo := volumeAttachmentsToAPIServer(result.VolumeAttachments)
	instanceID := result.Instance.Id()

	// Gather the charm LXD profile names, including the lxd profile names from
	// the container brokers.
	charmLXDProfiles, err := task.gatherCharmLXDProfiles(
		string(instanceID), machine.Tag().Id(), startInstanceParams.CharmLXDProfiles)
	if err != nil {
		return errors.Trace(err)
	}

	if err := machine.SetInstanceInfo(
		instanceID,
		result.DisplayName,
		startInstanceParams.InstanceConfig.MachineNonce,
		result.Hardware,
		networkConfig,
		volumes,
		volumeNameToAttachmentInfo,
		charmLXDProfiles,
	); err != nil {
		// We need to stop the instance right away here, set error status and go on.
		if err2 := task.setErrorStatus("cannot register instance for machine %v: %v", machine, err); err2 != nil {
			task.logger.Errorf("%v", errors.Annotate(err2, "setting machine status"))
		}
		if err2 := task.broker.StopInstances(ctx, instanceID); err2 != nil {
			task.logger.Errorf("%v", errors.Annotate(err2, "after failing to set instance info"))
		}
		return errors.Annotate(err, "setting instance info")
	}

	task.logger.Infof(
		"started machine %s as instance %s with hardware %q, network config %+v, "+
			"volumes %v, volume attachments %v, subnets to zones %v, lxd profiles %v",
		machine,
		instanceID,
		result.Hardware,
		networkConfig,
		volumes,
		volumeNameToAttachmentInfo,
		startInstanceParams.SubnetsToZones,
		startInstanceParams.CharmLXDProfiles,
	)
	return nil
}

// setupToStartMachine gathers the necessary information,
// based on the specified machine, to create ProvisioningInfo
// and StartInstanceParams to be used by startMachine.
func (task *provisionerTask) setupToStartMachine(
	machine apiprovisioner.MachineProvisioner, version *version.Number, pInfoResult params.ProvisioningInfoResult,
) (environs.StartInstanceParams, error) {
	// Check that we have a result.
	// We should never have an empty result without an error,
	// but we guard for that conservatively.
	if pInfoResult.Error != nil {
		return environs.StartInstanceParams{}, *pInfoResult.Error
	}
	pInfo := pInfoResult.Result
	if pInfo == nil {
		return environs.StartInstanceParams{}, errors.Errorf("no provisioning info for machine %q", machine.Id())
	}

	instanceCfg, err := task.constructInstanceConfig(machine, task.auth, pInfo)
	if err != nil {
		return environs.StartInstanceParams{}, errors.Annotatef(err, "creating instance config for machine %q", machine)
	}

	// We default to amd64 unless otherwise specified.
	agentArch := arch.DefaultArchitecture
	if pInfo.Constraints.Arch != nil {
		agentArch = *pInfo.Constraints.Arch
	}

	possibleTools, err := task.toolsFinder.FindTools(*version, pInfo.Base.Name, agentArch)
	if err != nil {
		return environs.StartInstanceParams{}, errors.Annotatef(err, "finding agent binaries for machine %q", machine)
	}

	startInstanceParams, err := task.constructStartInstanceParams(
		task.controllerUUID,
		machine,
		instanceCfg,
		pInfo,
		possibleTools,
	)
	if err != nil {
		return environs.StartInstanceParams{}, errors.Annotatef(err, "constructing params for machine %q", machine)
	}

	return startInstanceParams, nil
}

// populateExcludedMachines translates the results of DeriveAvailabilityZones
// into availabilityZoneMachines.ExcludedMachineIds for machines not to be used
// in the given zone.
func (task *provisionerTask) populateExcludedMachines(ctx context.ProviderCallContext, machineId string, startInstanceParams environs.StartInstanceParams) error {
	zonedEnv, ok := task.broker.(providercommon.ZonedEnviron)
	if !ok {
		return nil
	}
	derivedZones, err := zonedEnv.DeriveAvailabilityZones(ctx, startInstanceParams)
	if err != nil {
		return errors.Trace(err)
	}
	if len(derivedZones) == 0 {
		return nil
	}
	task.machinesMutex.Lock()
	defer task.machinesMutex.Unlock()
	useZones := set.NewStrings(derivedZones...)
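	// Exclude this machine from every zone that the derived set does not include.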
	for _, zoneMachines := range task.availabilityZoneMachines {
		if !useZones.Contains(zoneMachines.ZoneName) {
			zoneMachines.ExcludedMachineIds.Add(machineId)
		}
	}
	return nil
}

// gatherCharmLXDProfiles consumes the charm's LXD profiles from the different
// sources. This includes getting the information from the broker.
func (task *provisionerTask) gatherCharmLXDProfiles(
	instanceID, machineTag string, machineProfiles []string,
) ([]string, error) {
	if !names.IsContainerMachine(machineTag) {
		return machineProfiles, nil
	}

	manager, ok := task.broker.(container.LXDProfileNameRetriever)
	if !ok {
		task.logger.Tracef("failed to gather profile names, broker didn't conform to LXDProfileNameRetriever")
		return machineProfiles, nil
	}

	profileNames, err := manager.LXDProfileNames(instanceID)
	if err != nil {
		return nil, errors.Trace(err)
	}

	return lxdprofile.LXDProfileNames(profileNames), nil
}

// markMachineFailedInAZ moves the machine in the given zone from MachineIds to
// FailedMachineIds in availabilityZoneMachines, and reports whether there are
// any availability zones that have not yet failed for the specified machine.
func (task *provisionerTask) markMachineFailedInAZ(machine apiprovisioner.MachineProvisioner, zone string,
	cons constraints.Value) (bool, error) {
	if zone == "" {
		return false, errors.New("no zone provided")
	}
	task.machinesMutex.Lock()
	defer task.machinesMutex.Unlock()
	for _, zoneMachines := range task.availabilityZoneMachines {
		if zone == zoneMachines.ZoneName {
			zoneMachines.MachineIds.Remove(machine.Id())
			zoneMachines.FailedMachineIds.Add(machine.Id())
			break
		}
	}

	// Check if there are any zones left to try (that also match constraints).
	for _, zoneMachines := range task.availabilityZoneMachines {
		if zoneMachines.MatchesConstraints(cons) &&
			!zoneMachines.FailedMachineIds.Contains(machine.Id()) &&
			!zoneMachines.ExcludedMachineIds.Contains(machine.Id()) {
			return true, nil
		}
	}
	return false, nil
}

func (task *provisionerTask) clearMachineAZFailures(machine apiprovisioner.MachineProvisioner) {
	task.machinesMutex.Lock()
	defer task.machinesMutex.Unlock()
	for _, zoneMachines := range task.availabilityZoneMachines {
		zoneMachines.FailedMachineIds.Remove(machine.Id())
	}
}

// removeMachineFromAZMap removes the specified machine from availabilityZoneMachines.
// It is assumed this is called when the machines are being deleted from state, or failed
// provisioning.
func (task *provisionerTask) removeMachineFromAZMap(machine apiprovisioner.MachineProvisioner) {
	machineId := machine.Id()
	task.machinesMutex.Lock()
	defer task.machinesMutex.Unlock()
	for _, zoneMachines := range task.availabilityZoneMachines {
		zoneMachines.MachineIds.Remove(machineId)
		zoneMachines.FailedMachineIds.Remove(machineId)
	}
}

// subnetZonesFromNetworkTopology denormalises the topology passed from the API
// server into a slice of subnet to AZ list maps, one for each listed space.
func subnetZonesFromNetworkTopology(topology params.ProvisioningNetworkTopology) []map[network.Id][]string {
	if len(topology.SpaceSubnets) == 0 {
		return nil
	}

	// We want to ensure consistent ordering of the return based on the spaces.
	spaceNames := make([]string, 0, len(topology.SpaceSubnets))
	for spaceName := range topology.SpaceSubnets {
		spaceNames = append(spaceNames, spaceName)
	}
	sort.Strings(spaceNames)

	subnetsToZones := make([]map[network.Id][]string, 0, len(spaceNames))
	for _, spaceName := range spaceNames {
		subnetAZs := make(map[network.Id][]string)
		for _, subnet := range topology.SpaceSubnets[spaceName] {
			subnetAZs[network.Id(subnet)] = topology.SubnetAZs[subnet]
		}
		subnetsToZones = append(subnetsToZones, subnetAZs)
	}
	return subnetsToZones
}

func volumesToAPIServer(volumes []storage.Volume) []params.Volume {
	result := make([]params.Volume, len(volumes))
	for i, v := range volumes {
		result[i] = params.Volume{
			VolumeTag: v.Tag.String(),
			Info: params.VolumeInfo{
				VolumeId:   v.VolumeId,
				HardwareId: v.HardwareId,
				WWN:        v.WWN, // pool
				Size:       v.Size,
				Persistent: v.Persistent,
			},
		}
	}
	return result
}

func volumeAttachmentsToAPIServer(attachments []storage.VolumeAttachment) map[string]params.VolumeAttachmentInfo {
	result := make(map[string]params.VolumeAttachmentInfo)
	for _, a := range attachments {

		// Volume attachment plans are used in the OCI provider where actions
		// are required on the instance itself in order to complete attachments
		// of SCSI volumes.
		// TODO (manadart 2020-02-04): I believe this code path to be untested.
		var planInfo *params.VolumeAttachmentPlanInfo
		if a.PlanInfo != nil {
			planInfo = &params.VolumeAttachmentPlanInfo{
				DeviceType:       a.PlanInfo.DeviceType,
				DeviceAttributes: a.PlanInfo.DeviceAttributes,
			}
		}

		result[a.Volume.String()] = params.VolumeAttachmentInfo{
			DeviceName: a.DeviceName,
			DeviceLink: a.DeviceLink,
			BusAddress: a.BusAddress,
			ReadOnly:   a.ReadOnly,
			PlanInfo:   planInfo,
		}
	}
	return result
}