github.com/niedbalski/juju@v0.0.0-20190215020005-8ff100488e47/worker/provisioner/provisioner_task.go

// Copyright 2012, 2013 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package provisioner

import (
	"fmt"
	"sort"
	"strings"
	"sync"
	"time"

	"github.com/juju/collections/set"
	"github.com/juju/errors"
	"github.com/juju/utils"
	"github.com/juju/version"
	"gopkg.in/juju/names.v2"
	"gopkg.in/juju/worker.v1"
	"gopkg.in/juju/worker.v1/catacomb"

	apiprovisioner "github.com/juju/juju/api/provisioner"
	"github.com/juju/juju/apiserver/common/networkingcommon"
	"github.com/juju/juju/apiserver/params"
	"github.com/juju/juju/cloudconfig/instancecfg"
	"github.com/juju/juju/container"
	"github.com/juju/juju/controller"
	"github.com/juju/juju/controller/authentication"
	"github.com/juju/juju/core/constraints"
	"github.com/juju/juju/core/instance"
	"github.com/juju/juju/core/lxdprofile"
	"github.com/juju/juju/core/status"
	"github.com/juju/juju/core/watcher"
	"github.com/juju/juju/environs"
	"github.com/juju/juju/environs/config"
	"github.com/juju/juju/environs/context"
	"github.com/juju/juju/environs/imagemetadata"
	"github.com/juju/juju/environs/instances"
	"github.com/juju/juju/environs/simplestreams"
	"github.com/juju/juju/network"
	providercommon "github.com/juju/juju/provider/common"
	"github.com/juju/juju/state"
	"github.com/juju/juju/state/multiwatcher"
	"github.com/juju/juju/storage"
	coretools "github.com/juju/juju/tools"
	"github.com/juju/juju/wrench"
)

type ProvisionerTask interface {
	worker.Worker

	// SetHarvestMode sets a flag to indicate how the provisioner task
	// should harvest machines. See config.HarvestMode for
	// documentation of behavior.
	SetHarvestMode(mode config.HarvestMode)
}

type MachineGetter interface {
	Machines(...names.MachineTag) ([]apiprovisioner.MachineResult, error)
	MachinesWithTransientErrors() ([]apiprovisioner.MachineStatusResult, error)
}

type DistributionGroupFinder interface {
	DistributionGroupByMachineId(...names.MachineTag) ([]apiprovisioner.DistributionGroupResult, error)
}

// ToolsFinder is an interface used for finding tools to run on
// provisioned instances.
type ToolsFinder interface {
	// FindTools returns a list of tools matching the specified
	// version, series, and architecture. If arch is empty, the
	// implementation is expected to use a well documented default.
	FindTools(version version.Number, series string, arch string) (coretools.List, error)
}

func NewProvisionerTask(
	controllerUUID string,
	machineTag names.MachineTag,
	harvestMode config.HarvestMode,
	machineGetter MachineGetter,
	distributionGroupFinder DistributionGroupFinder,
	toolsFinder ToolsFinder,
	machineWatcher watcher.StringsWatcher,
	retryWatcher watcher.NotifyWatcher,
	profileWatcher watcher.StringsWatcher,
	broker environs.InstanceBroker,
	auth authentication.AuthenticationProvider,
	imageStream string,
	retryStartInstanceStrategy RetryStrategy,
	cloudCallContext context.ProviderCallContext,
) (ProvisionerTask, error) {
	machineChanges := machineWatcher.Changes()
	workers := []worker.Worker{machineWatcher}
	var retryChanges watcher.NotifyChannel
	if retryWatcher != nil {
		retryChanges = retryWatcher.Changes()
		workers = append(workers, retryWatcher)
	}
	profileChanges := profileWatcher.Changes()
	task := &provisionerTask{
		controllerUUID:             controllerUUID,
		machineTag:                 machineTag,
		machineGetter:              machineGetter,
		distributionGroupFinder:    distributionGroupFinder,
		toolsFinder:                toolsFinder,
		machineChanges:             machineChanges,
		retryChanges:               retryChanges,
		profileChanges:             profileChanges,
		broker:                     broker,
		auth:                       auth,
		harvestMode:                harvestMode,
		harvestModeChan:            make(chan config.HarvestMode, 1),
		machines:                   make(map[string]apiprovisioner.MachineProvisioner),
		availabilityZoneMachines:   make([]*AvailabilityZoneMachine, 0),
		imageStream:                imageStream,
		retryStartInstanceStrategy: retryStartInstanceStrategy,
		cloudCallCtx:               cloudCallContext,
	}
	err := catacomb.Invoke(catacomb.Plan{
		Site: &task.catacomb,
		Work: task.loop,
		Init: workers,
	})
	if err != nil {
		return nil, errors.Trace(err)
	}
	// Get existing machine distributions.
	err = task.populateAvailabilityZoneMachines()
	// Not all providers implement ZonedEnviron.
	if err != nil && !errors.IsNotImplemented(err) {
		return nil, errors.Trace(err)
	}
	return task, nil
}

type provisionerTask struct {
	controllerUUID             string
	machineTag                 names.MachineTag
	machineGetter              MachineGetter
	distributionGroupFinder    DistributionGroupFinder
	toolsFinder                ToolsFinder
	machineChanges             watcher.StringsChannel
	retryChanges               watcher.NotifyChannel
	profileChanges             watcher.StringsChannel
	broker                     environs.InstanceBroker
	catacomb                   catacomb.Catacomb
	auth                       authentication.AuthenticationProvider
	imageStream                string
	harvestMode                config.HarvestMode
	harvestModeChan            chan config.HarvestMode
	retryStartInstanceStrategy RetryStrategy
	// instance id -> instance
	instances map[instance.Id]instances.Instance
	// machine id -> machine
	machines                 map[string]apiprovisioner.MachineProvisioner
	machinesMutex            sync.RWMutex
	availabilityZoneMachines []*AvailabilityZoneMachine
	cloudCallCtx             context.ProviderCallContext
}

// Kill implements worker.Worker.Kill.
func (task *provisionerTask) Kill() {
	task.catacomb.Kill(nil)
}

// Wait implements worker.Worker.Wait.
func (task *provisionerTask) Wait() error {
	return task.catacomb.Wait()
}

func (task *provisionerTask) loop() error {

	// Don't allow the harvesting mode to change until we have read at
	// least one set of changes, which will populate the task.machines
	// map.
	// Otherwise we will potentially see all legitimate instances
	// as unknown.
	var harvestModeChan chan config.HarvestMode

	// When the watcher is started, the initial changes will be all the
	// machines that are relevant. Also, since this is available straight
	// away, we know there will be some changes right off the bat.
	for {
		select {
		case <-task.catacomb.Dying():
			logger.Infof("Shutting down provisioner task %s", task.machineTag)
			return task.catacomb.ErrDying()
		case ids, ok := <-task.machineChanges:
			if !ok {
				return errors.New("machine watcher closed channel")
			}
			if err := task.processMachines(ids); err != nil {
				return errors.Annotate(err, "failed to process updated machines")
			}
			// We've seen a set of changes. Enable modification of
			// harvesting mode.
			harvestModeChan = task.harvestModeChan
		case harvestMode := <-harvestModeChan:
			if harvestMode == task.harvestMode {
				break
			}
			logger.Infof("harvesting mode changed to %s", harvestMode)
			task.harvestMode = harvestMode
			if harvestMode.HarvestUnknown() {
				logger.Infof("harvesting unknown machines")
				if err := task.processMachines(nil); err != nil {
					return errors.Annotate(err, "failed to process machines after safe mode disabled")
				}
			}
		case <-task.retryChanges:
			if err := task.processMachinesWithTransientErrors(); err != nil {
				return errors.Annotate(err, "failed to process machines with transient errors")
			}
		case ids, ok := <-task.profileChanges:
			if !ok {
				return errors.New("profile watcher closed channel")
			}
			if err := task.processProfileChanges(ids); err != nil {
				return errors.Annotate(err, "failed to process updated charm profiles")
			}
		}
	}
}

// SetHarvestMode implements ProvisionerTask.SetHarvestMode().
func (task *provisionerTask) SetHarvestMode(mode config.HarvestMode) {
	select {
	case task.harvestModeChan <- mode:
	case <-task.catacomb.Dying():
	}
}

func (task *provisionerTask) processMachinesWithTransientErrors() error {
	results, err := task.machineGetter.MachinesWithTransientErrors()
	if err != nil {
		return nil
	}
	logger.Tracef("processMachinesWithTransientErrors(%v)", results)
	var pending []apiprovisioner.MachineProvisioner
	for _, result := range results {
		if result.Status.Error != nil {
			logger.Errorf("cannot retry provisioning of machine %q: %v", result.Machine.Id(), result.Status.Error)
			continue
		}
		machine := result.Machine
		if err := machine.SetStatus(status.Pending, "", nil); err != nil {
			logger.Errorf("cannot reset status of machine %q: %v", machine.Id(), err)
			continue
		}
		if err := machine.SetInstanceStatus(status.Provisioning, "", nil); err != nil {
			logger.Errorf("cannot reset instance status of machine %q: %v", machine.Id(), err)
			continue
		}
		task.machinesMutex.Lock()
		task.machines[machine.Tag().String()] = machine
		task.machinesMutex.Unlock()
		pending = append(pending, machine)
	}
	return task.startMachines(pending)
}

func (task *provisionerTask) processMachines(ids []string) error {
	logger.Tracef("processMachines(%v)", ids)

	// Populate the task's maps of current instances and machines.
	if err := task.populateMachineMaps(ids); err != nil {
		return err
	}

	// Find machines without an instance id or that are dead.
	pending, dead, maintain, err := task.pendingOrDeadOrMaintain(ids)
	if err != nil {
		return err
	}

	// Stop all machines that are dead.
	stopping := task.instancesForDeadMachines(dead)

	// Find running instances that have no machines associated.
	unknown, err := task.findUnknownInstances(stopping)
	if err != nil {
		return err
	}
	if !task.harvestMode.HarvestUnknown() {
		logger.Infof(
			"%s is set to %s; unknown instances not stopped %v",
			config.ProvisionerHarvestModeKey,
			task.harvestMode.String(),
			instanceIds(unknown),
		)
		unknown = nil
	}
	if task.harvestMode.HarvestNone() || !task.harvestMode.HarvestDestroyed() {
		logger.Infof(
			`%s is set to "%s"; will not harvest %s`,
			config.ProvisionerHarvestModeKey,
			task.harvestMode.String(),
			instanceIds(stopping),
		)
		stopping = nil
	}

	if len(stopping) > 0 {
		logger.Infof("stopping known instances %v", stopping)
	}
	if len(unknown) > 0 {
		logger.Infof("stopping unknown instances %v", instanceIds(unknown))
	}
	// It's important that we stop unknown instances before starting
	// pending ones, because if we start an instance and then fail to
	// set its InstanceId on the machine we don't want to start a new
	// instance for the same machine ID.
	if err := task.stopInstances(append(stopping, unknown...)); err != nil {
		return err
	}

	// Remove any dead machines from state.
	for _, machine := range dead {
		logger.Infof("removing dead machine %q", machine.Id())
		if err := machine.MarkForRemoval(); err != nil {
			logger.Errorf("failed to remove dead machine %q", machine.Id())
		}
		task.removeMachineFromAZMap(machine)
		task.machinesMutex.Lock()
		delete(task.machines, machine.Id())
		task.machinesMutex.Unlock()
	}

	// Any machines that require maintenance get pinged.
	task.maintainMachines(maintain)

	// Start an instance for the pending ones.
	return task.startMachines(pending)
}

// processProfileChanges adds, removes, or updates lxd profile changes on
// existing machines, if supported by the machine's broker.
//
// If this action is triggered by a charm upgrade, the instance charm profile
// data doc is always created, allowing the uniter to determine whether the
// profile upgrade is in a terminal state before proceeding with the charm
// upgrade itself.
//
// If this action is triggered by a second unit being added to an existing
// machine, clean-up of the instance charm profile data doc happens here when
// the machine's broker supports lxd profiles.
//
// If the broker does not support lxd profiles, it is harder to determine
// whether the instance charm profile data doc should be cleaned up. Therefore
// it is set to NotSupportedStatus, which is then deleted by the uniter at
// installation.
func (task *provisionerTask) processProfileChanges(ids []string) error {
	logger.Tracef("processProfileChanges(%v)", ids)
	if len(ids) == 0 {
		// TODO: (hml) 2018-11-29
		// This shouldn't be triggered; until that's fixed,
		// short-circuit here when there's nothing to process.
		return nil
	}

	machineTags := make([]names.MachineTag, len(ids))
	for i, id := range ids {
		machineTags[i] = names.NewMachineTag(id)
	}
	machines, err := task.machineGetter.Machines(machineTags...)
	if err != nil {
		return errors.Annotatef(err, "failed to get machines %v", ids)
	}
	profileBroker, ok := task.broker.(environs.LXDProfiler)
	if !ok {
		logger.Debugf("Attempting to update the profile of a machine that doesn't support profiles")
		profileUpgradeNotSupported(machines)
		return nil
	}
	for i, mResult := range machines {
		if mResult.Err != nil {
			return errors.Annotatef(mResult.Err, "failed to get machine %v", machineTags[i])
		}
		m := mResult.Machine
		removeDoc, err := processOneMachineProfileChange(m, profileBroker)
		if removeDoc {
			if err != nil {
				logger.Errorf("cannot upgrade machine's lxd profile: %s", err.Error())
			}
			if err := m.RemoveUpgradeCharmProfileData(); err != nil {
				logger.Errorf("cannot remove subordinates upgrade charm profile data: %s", err.Error())
			}
		} else if err != nil {
			logger.Errorf("cannot upgrade machine's lxd profile: %s", err.Error())
			if err2 := m.SetUpgradeCharmProfileComplete(lxdprofile.AnnotateErrorStatus(err)); err2 != nil {
				return errors.Annotatef(err2, "cannot set error status for instance charm profile data for machine %q", m)
			}
			// If Error, SetInstanceStatus in the provisioner api will also call
			// SetStatus.
			if err2 := m.SetInstanceStatus(status.Error, "cannot upgrade machine's lxd profile: "+err.Error(), nil); err2 != nil {
				return errors.Annotatef(err2, "cannot set error status for machine %q", m)
			}
		} else {
			// Clean up any residual errors in the machine status from a previous
			// upgrade charm profile failure.
			if err2 := m.SetInstanceStatus(status.Running, "Running", nil); err2 != nil {
				return errors.Annotatef(err2, "cannot set error status for machine %q", m)
			}
			if err2 := m.SetStatus(status.Started, "", nil); err2 != nil {
				return errors.Annotatef(err2, "cannot set error status for machine %q agent", m)
			}
			if err2 := m.SetUpgradeCharmProfileComplete(lxdprofile.SuccessStatus); err2 != nil {
				return errors.Annotatef(err2, "cannot set success status for instance charm profile data for machine %q", m)
			}
		}
	}
	return nil
}

func profileUpgradeNotSupported(machines []apiprovisioner.MachineResult) {
	for _, mResult := range machines {
		if err := mResult.Machine.SetUpgradeCharmProfileComplete(lxdprofile.NotSupportedStatus); err != nil {
			logger.Errorf("cannot set not supported status for instance charm profile data: %s", err.Error())
		}
	}
}

func processOneMachineProfileChange(
	m apiprovisioner.MachineProvisioner,
	profileBroker environs.LXDProfiler,
) (bool, error) {
	logger.Debugf("processOneMachineProfileChange(%s)", m.Id())
	info, err := m.CharmProfileChangeInfo()
	if err != nil {
		return false, err
	}
	instId, err := m.InstanceId()
	if err != nil {
		return false, err
	}
	newProfiles, err := profileBroker.ReplaceOrAddInstanceProfile(string(instId), info.OldProfileName, info.NewProfileName, info.LXDProfile)
	if err != nil {
		return false, err
	}
	// newProfiles:
	// default
	// juju-<model> <-- not included on containers
	// juju-<model>-<application>-<charm-revision>
	if len(newProfiles) > 1 && newProfiles[0] == "default" {
		newProfiles = newProfiles[1:]
	}
	if len(newProfiles) > 1 {
		// Remove if not juju-<model>-<application>-<charm-revision>.
		if _, err = lxdprofile.ProfileRevision(newProfiles[0]); err != nil {
			newProfiles = newProfiles[1:]
		}
	}
	initialAddOfSubordinateProfile := info.Subordinate && info.OldProfileName == ""
	return initialAddOfSubordinateProfile, m.SetCharmProfiles(newProfiles)
}

func instanceIds(instances []instances.Instance) []string {
	ids := make([]string, 0, len(instances))
	for _, inst := range instances {
		ids = append(ids, string(inst.Id()))
	}
	return ids
}

// populateMachineMaps updates task.instances. Also updates
// task.machines map if a list of IDs is given.
func (task *provisionerTask) populateMachineMaps(ids []string) error {
	task.instances = make(map[instance.Id]instances.Instance)

	instances, err := task.broker.AllInstances(task.cloudCallCtx)
	if err != nil {
		return errors.Annotate(err, "failed to get all instances from broker")
	}
	for _, i := range instances {
		task.instances[i.Id()] = i
	}

	// Update the machines map with new data for each of the machines in the
	// change list.
	machineTags := make([]names.MachineTag, len(ids))
	for i, id := range ids {
		machineTags[i] = names.NewMachineTag(id)
	}
	machines, err := task.machineGetter.Machines(machineTags...)
	if err != nil {
		return errors.Annotatef(err, "failed to get machines %v", ids)
	}
	task.machinesMutex.Lock()
	defer task.machinesMutex.Unlock()
	for i, result := range machines {
		switch {
		case result.Err == nil:
			task.machines[result.Machine.Id()] = result.Machine
		case params.IsCodeNotFoundOrCodeUnauthorized(result.Err):
			logger.Debugf("machine %q not found in state", ids[i])
			delete(task.machines, ids[i])
		default:
			return errors.Annotatef(result.Err, "failed to get machine %v", ids[i])
		}
	}
	return nil
}

// pendingOrDeadOrMaintain looks up machines with ids and returns those that do
// not have an instance id assigned yet (pending), those that are dead, and
// those that require maintenance.
func (task *provisionerTask) pendingOrDeadOrMaintain(ids []string) (pending, dead, maintain []apiprovisioner.MachineProvisioner, err error) {
	task.machinesMutex.RLock()
	defer task.machinesMutex.RUnlock()
	for _, id := range ids {
		machine, found := task.machines[id]
		if !found {
			logger.Infof("machine %q not found", id)
			continue
		}
		var classification MachineClassification
		classification, err = classifyMachine(machine)
		if err != nil {
			return // return the error
		}
		switch classification {
		case Pending:
			pending = append(pending, machine)
		case Dead:
			dead = append(dead, machine)
		case Maintain:
			maintain = append(maintain, machine)
		}
	}
	logger.Tracef("pending machines: %v", pending)
	logger.Tracef("dead machines: %v", dead)
	return
}

type ClassifiableMachine interface {
	Life() params.Life
	InstanceId() (instance.Id, error)
	EnsureDead() error
	Status() (status.Status, string, error)
	InstanceStatus() (status.Status, string, error)
	Id() string
}

type MachineClassification string

const (
	None     MachineClassification = "none"
	Pending  MachineClassification = "Pending"
	Dead     MachineClassification = "Dead"
	Maintain MachineClassification = "Maintain"
)

func classifyMachine(machine ClassifiableMachine) (
	MachineClassification, error) {
	switch machine.Life() {
	case params.Dying:
		if _, err := machine.InstanceId(); err == nil {
			return None, nil
		} else if !params.IsCodeNotProvisioned(err) {
			return None, errors.Annotatef(err, "failed to load dying machine id:%s, details:%v", machine.Id(), machine)
		}
		logger.Infof("killing dying, unprovisioned machine %q", machine)
		if err := machine.EnsureDead(); err != nil {
			return None, errors.Annotatef(err, "failed to ensure machine dead id:%s, details:%v", machine.Id(), machine)
		}
		fallthrough
	case params.Dead:
		return Dead, nil
	}
	instId, err := machine.InstanceId()
	if err != nil {
		if !params.IsCodeNotProvisioned(err) {
			return None, errors.Annotatef(err, "failed to load machine id:%s, details:%v", machine.Id(), machine)
		}
		machineStatus, _, err := machine.Status()
		if err != nil {
			logger.Infof("cannot get machine id:%s, details:%v, err:%v", machine.Id(), machine, err)
			return None, nil
		}
		if machineStatus == status.Pending {
			logger.Infof("found machine pending provisioning id:%s, details:%v", machine.Id(), machine)
			return Pending, nil
		}
		instanceStatus, _, err := machine.InstanceStatus()
		if err != nil {
			logger.Infof("cannot read instance status id:%s, details:%v, err:%v", machine.Id(), machine, err)
			return None, nil
		}
		if instanceStatus == status.Provisioning {
			logger.Infof("found machine provisioning id:%s, details:%v", machine.Id(), machine)
			return Pending, nil
		}
		return None, nil
	}
	logger.Infof("machine %s already started as instance %q", machine.Id(), instId)

	if state.ContainerTypeFromId(machine.Id()) != "" {
		return Maintain, nil
	}
	return None, nil
}

// findUnknownInstances finds instances which are not associated with a machine.
func (task *provisionerTask) findUnknownInstances(stopping []instances.Instance) ([]instances.Instance, error) {
	// Make a copy of the instances we know about.
	taskInstances := make(map[instance.Id]instances.Instance)
	for k, v := range task.instances {
		taskInstances[k] = v
	}

	task.machinesMutex.RLock()
	defer task.machinesMutex.RUnlock()
	for _, m := range task.machines {
		instId, err := m.InstanceId()
		switch {
		case err == nil:
			delete(taskInstances, instId)
		case params.IsCodeNotProvisioned(err):
		case params.IsCodeNotFoundOrCodeUnauthorized(err):
		default:
			return nil, err
		}
	}
	// Now remove all those instances that we are stopping already as we
	// know about those and don't want to include them in the unknown list.
	for _, inst := range stopping {
		delete(taskInstances, inst.Id())
	}
	var unknown []instances.Instance
	for _, inst := range taskInstances {
		unknown = append(unknown, inst)
	}
	return unknown, nil
}

// instancesForDeadMachines returns a list of instances.Instance that represent
// the list of dead machines running in the provider. Missing machines are
// omitted from the list.
func (task *provisionerTask) instancesForDeadMachines(deadMachines []apiprovisioner.MachineProvisioner) []instances.Instance {
	var instances []instances.Instance
	for _, machine := range deadMachines {
		instId, err := machine.InstanceId()
		if err == nil {
			keep, _ := machine.KeepInstance()
			if keep {
				logger.Debugf("machine %v is dead but keep-instance is true", instId)
				continue
			}
			inst, found := task.instances[instId]
			// If the instance is not found we can't stop it.
			if found {
				instances = append(instances, inst)
			}
		}
	}
	return instances
}

func (task *provisionerTask) stopInstances(instances []instances.Instance) error {
	// Although calling StopInstance with an empty slice should produce no change in the
	// provider, environs like dummy do not consider this a noop.
	if len(instances) == 0 {
		return nil
	}
	if wrench.IsActive("provisioner", "stop-instances") {
		return errors.New("wrench in the works")
	}

	ids := make([]instance.Id, len(instances))
	for i, inst := range instances {
		ids[i] = inst.Id()
	}
	if err := task.broker.StopInstances(task.cloudCallCtx, ids...); err != nil {
		return errors.Annotate(err, "broker failed to stop instances")
	}
	return nil
}

func (task *provisionerTask) constructInstanceConfig(
	machine apiprovisioner.MachineProvisioner,
	auth authentication.AuthenticationProvider,
	pInfo *params.ProvisioningInfo,
) (*instancecfg.InstanceConfig, error) {

	stateInfo, apiInfo, err := auth.SetupAuthentication(machine)
	if err != nil {
		return nil, errors.Annotate(err, "failed to setup authentication")
	}

	// Generate a nonce for the new instance, with the format: "machine-#:UUID".
	// The first part is a badge, specifying the tag of the machine the provisioner
	// is running on, while the second part is a random UUID.
	uuid, err := utils.NewUUID()
	if err != nil {
		return nil, errors.Annotate(err, "failed to generate a nonce for machine "+machine.Id())
	}

	nonce := fmt.Sprintf("%s:%s", task.machineTag, uuid)
	instanceConfig, err := instancecfg.NewInstanceConfig(
		names.NewControllerTag(controller.Config(pInfo.ControllerConfig).ControllerUUID()),
		machine.Id(),
		nonce,
		task.imageStream,
		pInfo.Series,
		apiInfo,
	)
	if err != nil {
		return nil, errors.Trace(err)
	}

	instanceConfig.Tags = pInfo.Tags
	if len(pInfo.Jobs) > 0 {
		instanceConfig.Jobs = pInfo.Jobs
	}

	if multiwatcher.AnyJobNeedsState(instanceConfig.Jobs...) {
		publicKey, err := simplestreams.UserPublicSigningKey()
		if err != nil {
			return nil, err
		}
		instanceConfig.Controller = &instancecfg.ControllerConfig{
			PublicImageSigningKey: publicKey,
			MongoInfo:             stateInfo,
		}
		instanceConfig.Controller.Config = make(map[string]interface{})
		for k, v := range pInfo.ControllerConfig {
			instanceConfig.Controller.Config[k] = v
		}
	}

	instanceConfig.CloudInitUserData = pInfo.CloudInitUserData

	return instanceConfig, nil
}

func (task *provisionerTask) constructStartInstanceParams(
	controllerUUID string,
	machine apiprovisioner.MachineProvisioner,
	instanceConfig *instancecfg.InstanceConfig,
	provisioningInfo *params.ProvisioningInfo,
	possibleTools coretools.List,
) (environs.StartInstanceParams, error) {

	volumes := make([]storage.VolumeParams, len(provisioningInfo.Volumes))
	for i, v := range provisioningInfo.Volumes {
		volumeTag, err := names.ParseVolumeTag(v.VolumeTag)
		if err != nil {
			return environs.StartInstanceParams{}, errors.Trace(err)
		}
		if v.Attachment == nil {
			return environs.StartInstanceParams{}, errors.Errorf("volume params missing attachment")
		}
		machineTag, err := names.ParseMachineTag(v.Attachment.MachineTag)
		if err != nil {
			return environs.StartInstanceParams{}, errors.Trace(err)
		}
		if machineTag != machine.Tag() {
			return environs.StartInstanceParams{}, errors.Errorf("volume attachment params has invalid machine tag")
		}
		if v.Attachment.InstanceId != "" {
			return environs.StartInstanceParams{}, errors.Errorf("volume attachment params specifies instance ID")
		}
		volumes[i] = storage.VolumeParams{
			Tag:          volumeTag,
			Size:         v.Size,
			Provider:     storage.ProviderType(v.Provider),
			Attributes:   v.Attributes,
			ResourceTags: v.Tags,
			Attachment: &storage.VolumeAttachmentParams{
				AttachmentParams: storage.AttachmentParams{
					Machine:  machineTag,
					ReadOnly: v.Attachment.ReadOnly,
				},
				Volume: volumeTag,
			},
		}
	}
	volumeAttachments := make([]storage.VolumeAttachmentParams, len(provisioningInfo.VolumeAttachments))
	for i, v := range provisioningInfo.VolumeAttachments {
		volumeTag, err := names.ParseVolumeTag(v.VolumeTag)
		if err != nil {
			return environs.StartInstanceParams{}, errors.Trace(err)
		}
		machineTag, err := names.ParseMachineTag(v.MachineTag)
		if err != nil {
			return environs.StartInstanceParams{}, errors.Trace(err)
		}
		if machineTag != machine.Tag() {
			return environs.StartInstanceParams{}, errors.Errorf("volume attachment params has invalid machine tag")
		}
		if v.InstanceId != "" {
			return environs.StartInstanceParams{}, errors.Errorf("volume attachment params specifies instance ID")
		}
		if v.VolumeId == "" {
			return environs.StartInstanceParams{}, errors.Errorf("volume attachment params does not specify volume ID")
		}
		volumeAttachments[i] = storage.VolumeAttachmentParams{
			AttachmentParams: storage.AttachmentParams{
				Provider: storage.ProviderType(v.Provider),
				Machine:  machineTag,
				ReadOnly: v.ReadOnly,
			},
			Volume:   volumeTag,
			VolumeId: v.VolumeId,
		}
	}

	var subnetsToZones map[network.Id][]string
	if provisioningInfo.SubnetsToZones != nil {
		// Convert subnet provider ids from string to network.Id.
		subnetsToZones = make(map[network.Id][]string, len(provisioningInfo.SubnetsToZones))
		for providerId, zones := range provisioningInfo.SubnetsToZones {
			subnetsToZones[network.Id(providerId)] = zones
		}
	}

	var endpointBindings map[string]network.Id
	if len(provisioningInfo.EndpointBindings) != 0 {
		endpointBindings = make(map[string]network.Id)
		for endpoint, space := range provisioningInfo.EndpointBindings {
			endpointBindings[endpoint] = network.Id(space)
		}
	}
	possibleImageMetadata := make([]*imagemetadata.ImageMetadata, len(provisioningInfo.ImageMetadata))
	for i, metadata := range provisioningInfo.ImageMetadata {
		possibleImageMetadata[i] = &imagemetadata.ImageMetadata{
			Id:          metadata.ImageId,
			Arch:        metadata.Arch,
			RegionAlias: metadata.Region,
			RegionName:  metadata.Region,
			Storage:     metadata.RootStorageType,
			Stream:      metadata.Stream,
			VirtType:    metadata.VirtType,
			Version:     metadata.Version,
		}
	}

	startInstanceParams := environs.StartInstanceParams{
		ControllerUUID:    controllerUUID,
		Constraints:       provisioningInfo.Constraints,
		Tools:             possibleTools,
		InstanceConfig:    instanceConfig,
		Placement:         provisioningInfo.Placement,
		Volumes:           volumes,
		VolumeAttachments: volumeAttachments,
		SubnetsToZones:    subnetsToZones,
		EndpointBindings:  endpointBindings,
		ImageMetadata:     possibleImageMetadata,
		StatusCallback:    machine.SetInstanceStatus,
		Abort:             task.catacomb.Dying(),
		CharmLXDProfiles:  provisioningInfo.CharmLXDProfiles,
	}

	return startInstanceParams, nil
}

func (task *provisionerTask) maintainMachines(machines []apiprovisioner.MachineProvisioner) error {
	for _, m := range machines {
		logger.Infof("maintainMachines: %v", m)
		startInstanceParams := environs.StartInstanceParams{}
		startInstanceParams.InstanceConfig = &instancecfg.InstanceConfig{}
		startInstanceParams.InstanceConfig.MachineId = m.Id()
		if err := task.broker.MaintainInstance(task.cloudCallCtx, startInstanceParams); err != nil {
			return errors.Annotatef(err, "cannot maintain machine %v", m)
		}
	}
	return nil
}

// AvailabilityZoneMachine keeps track of a single zone and which machines
// are in it, which machines have failed to use it, and which machines
// shouldn't use it. This data is used to decide how to distribute
// machines across availability zones.
//
// Exposed for testing.
type AvailabilityZoneMachine struct {
	ZoneName           string
	MachineIds         set.Strings
	FailedMachineIds   set.Strings
	ExcludedMachineIds set.Strings // Don't use these machines in the zone.
}

// populateAvailabilityZoneMachines fills in availabilityZoneMachines, if
// empty, with a current mapping of availability zone to IDs of machines
// running in that zone. If the provider does not implement the ZonedEnviron
// interface, return nil.
func (task *provisionerTask) populateAvailabilityZoneMachines() error {
	task.machinesMutex.Lock()
	defer task.machinesMutex.Unlock()

	if len(task.availabilityZoneMachines) > 0 {
		return nil
	}
	zonedEnv, ok := task.broker.(providercommon.ZonedEnviron)
	if !ok {
		return nil
	}

	// In this case, AvailabilityZoneAllocations() will return all of the "available"
	// availability zones and their instance allocations.
	availabilityZoneInstances, err := providercommon.AvailabilityZoneAllocations(
		zonedEnv, task.cloudCallCtx, []instance.Id{})
	if err != nil {
		return err
	}

	instanceMachines := make(map[instance.Id]string)
	for _, machine := range task.machines {
		instId, err := machine.InstanceId()
		if err != nil {
			continue
		}
		instanceMachines[instId] = machine.Id()
	}

	// Convert instance IDs to machine IDs to aid in distributing
	// not-yet-created instances across availability zones.
	task.availabilityZoneMachines = make([]*AvailabilityZoneMachine, len(availabilityZoneInstances))
	for i, instances := range availabilityZoneInstances {
		machineIds := set.NewStrings()
		for _, instanceId := range instances.Instances {
			if id, ok := instanceMachines[instanceId]; ok {
				machineIds.Add(id)
			}
		}
		task.availabilityZoneMachines[i] = &AvailabilityZoneMachine{
			ZoneName:           instances.ZoneName,
			MachineIds:         machineIds,
			FailedMachineIds:   set.NewStrings(),
			ExcludedMachineIds: set.NewStrings(),
		}
	}
	return nil
}

// populateDistributionGroupZoneMap returns a zone mapping which only includes
// machines in the same distribution group. This is used to determine where new
// machines in that distribution group should be placed.
func (task *provisionerTask) populateDistributionGroupZoneMap(machineIds []string) []*AvailabilityZoneMachine {
	var dgAvailabilityZoneMachines []*AvailabilityZoneMachine
	dgSet := set.NewStrings(machineIds...)
	for _, azm := range task.availabilityZoneMachines {
		dgAvailabilityZoneMachines = append(dgAvailabilityZoneMachines, &AvailabilityZoneMachine{
			azm.ZoneName,
			azm.MachineIds.Intersection(dgSet),
			azm.FailedMachineIds,
			azm.ExcludedMachineIds,
		})
	}
	return dgAvailabilityZoneMachines
}
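
// The sketch below is illustrative only and is not part of the upstream
// provisioner code: it shows how populateDistributionGroupZoneMap narrows the
// per-zone machine sets down to a distribution group. The zone names and
// machine IDs used here are hypothetical.
func exampleDistributionGroupZoneMap() []*AvailabilityZoneMachine {
	task := &provisionerTask{
		availabilityZoneMachines: []*AvailabilityZoneMachine{
			{ZoneName: "az1", MachineIds: set.NewStrings("0", "3")},
			{ZoneName: "az2", MachineIds: set.NewStrings("1")},
		},
	}
	// Only machines "1" and "3" belong to the distribution group, so the
	// result keeps az1 with {"3"} and az2 with {"1"}; other members drop out.
	return task.populateDistributionGroupZoneMap([]string{"1", "3"})
}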

// machineAvailabilityZoneDistribution returns a suggested availability zone
// for the specified machine to start in.
// If the current provider does not implement availability zones, "" and no
// error will be returned.
// Machines are spread across availability zones based on lowest population of
// the "available" zones, and any supplied zone constraints.
// Machines in the same DistributionGroup are placed in different zones,
// distributed based on lowest population of machines in that DistributionGroup.
// Machines are not placed in a zone they are excluded from.
// If availability zones are implemented and no suitable zone is found, a
// NotFound error is returned.
func (task *provisionerTask) machineAvailabilityZoneDistribution(
	machineId string, distGroupMachineIds []string, cons constraints.Value,
) (string, error) {
	task.machinesMutex.Lock()
	defer task.machinesMutex.Unlock()

	if len(task.availabilityZoneMachines) == 0 {
		return "", nil
	}

	// Assign an initial zone to a machine based on lowest population,
	// accommodating any supplied zone constraints.
	// If the machine has a distribution group, assign based on lowest zone
	// population of the distribution group machines.
	var machineZone string
	if len(distGroupMachineIds) > 0 {
		dgZoneMap := azMachineFilterSort(task.populateDistributionGroupZoneMap(distGroupMachineIds)).FilterZones(cons)
		sort.Sort(dgZoneMap)
		for _, dgZoneMachines := range dgZoneMap {
			if !dgZoneMachines.FailedMachineIds.Contains(machineId) &&
				!dgZoneMachines.ExcludedMachineIds.Contains(machineId) {
				machineZone = dgZoneMachines.ZoneName
				for _, azm := range task.availabilityZoneMachines {
					if azm.ZoneName == dgZoneMachines.ZoneName {
						azm.MachineIds.Add(machineId)
						break
					}
				}
				break
			}
		}
	} else {
		zoneMap := azMachineFilterSort(task.availabilityZoneMachines).FilterZones(cons)
		sort.Sort(zoneMap)
		for _, zoneMachines := range zoneMap {
			if !zoneMachines.FailedMachineIds.Contains(machineId) &&
				!zoneMachines.ExcludedMachineIds.Contains(machineId) {
				machineZone = zoneMachines.ZoneName
				zoneMachines.MachineIds.Add(machineId)
				break
			}
		}
	}
	if machineZone == "" {
		return machineZone, errors.NotFoundf("suitable availability zone for machine %v", machineId)
	}
	return machineZone, nil
}

// azMachineFilterSort extends a slice of AvailabilityZoneMachine references
// with a sort implementation by zone population and name,
// and filtration based on zones expressed in constraints.
type azMachineFilterSort []*AvailabilityZoneMachine

// FilterZones returns a new instance consisting of slice members limited to
// zones expressed in the input constraints.
// Absence of zone constraints leaves the return unfiltered.
func (a azMachineFilterSort) FilterZones(cons constraints.Value) azMachineFilterSort {
	if !cons.HasZones() {
		return a
	}

	logger.Debugf("applying availability zone constraints: %s", strings.Join(*cons.Zones, ", "))
	filtered := a[:0]
	for _, azm := range a {
		for _, zone := range *cons.Zones {
			if azm.ZoneName == zone {
				filtered = append(filtered, azm)
				break
			}
		}
	}
	return filtered
}

func (a azMachineFilterSort) Len() int {
	return len(a)
}

func (a azMachineFilterSort) Less(i, j int) bool {
	switch {
	case a[i].MachineIds.Size() < a[j].MachineIds.Size():
		return true
	case a[i].MachineIds.Size() == a[j].MachineIds.Size():
		return a[i].ZoneName < a[j].ZoneName
	}
	return false
}

func (a azMachineFilterSort) Swap(i, j int) {
	a[i], a[j] = a[j], a[i]
}
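
// The sketch below is illustrative only and is not part of the upstream
// provisioner code: it shows how FilterZones and sort.Sort combine to pick
// the least-populated zone permitted by a "zones=..." constraint. The zone
// names and machine IDs used here are hypothetical.
func exampleZoneSelection() string {
	zones := azMachineFilterSort{
		{ZoneName: "az1", MachineIds: set.NewStrings("0", "1")},
		{ZoneName: "az2", MachineIds: set.NewStrings("2")},
		{ZoneName: "az3", MachineIds: set.NewStrings()},
	}
	// az3 is empty but not permitted by the constraint, so it is filtered out.
	allowed := zones.FilterZones(constraints.MustParse("zones=az1,az2"))
	sort.Sort(allowed)
	// After sorting by population (then name), the first entry is az2.
	return allowed[0].ZoneName
}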

// startMachines starts a goroutine for each specified machine to
// start it. Errors from individual start attempts are aggregated and
// returned once all machines have been processed.
func (task *provisionerTask) startMachines(machines []apiprovisioner.MachineProvisioner) error {
	if len(machines) == 0 {
		return nil
	}

	// Get the distribution groups for each machine now to avoid
	// successive calls to DistributionGroupByMachineId which will
	// return the same data.
	machineTags := make([]names.MachineTag, len(machines))
	for i, machine := range machines {
		machineTags[i] = machine.MachineTag()
	}
	machineDistributionGroups, err := task.distributionGroupFinder.DistributionGroupByMachineId(machineTags...)
	if err != nil {
		return err
	}

	var wg sync.WaitGroup
	errMachines := make([]error, len(machines))
	for i, m := range machines {
		if machineDistributionGroups[i].Err != nil {
			task.setErrorStatus(
				"fetching distribution groups for machine %q: %v",
				m, machineDistributionGroups[i].Err,
			)
			continue
		}
		wg.Add(1)
		go func(machine apiprovisioner.MachineProvisioner, dg []string, index int) {
			defer wg.Done()
			if err := task.startMachine(machine, dg); err != nil {
				task.removeMachineFromAZMap(machine)
				errMachines[index] = err
			}
		}(m, machineDistributionGroups[i].MachineIds, i)
	}

	wg.Wait()
	select {
	case <-task.catacomb.Dying():
		return task.catacomb.ErrDying()
	default:
	}
	var errorStrings []string
	for _, err := range errMachines {
		if err != nil {
			errorStrings = append(errorStrings, err.Error())
		}
	}
	if errorStrings != nil {
		return errors.New(strings.Join(errorStrings, "\n"))
	}
	return nil
}

func (task *provisionerTask) setErrorStatus(message string, machine apiprovisioner.MachineProvisioner, err error) error {
	logger.Errorf(message, machine, err)
	errForStatus := errors.Cause(err)
	if err2 := machine.SetInstanceStatus(status.ProvisioningError, errForStatus.Error(), nil); err2 != nil {
		// Something is wrong with this machine, better report it back.
		return errors.Annotatef(err2, "cannot set error status for machine %q", machine)
	}
	return nil
}

// setupToStartMachine gathers the necessary information,
// based on the specified machine, to create ProvisioningInfo
// and StartInstanceParams to be used by startMachine.
func (task *provisionerTask) setupToStartMachine(machine apiprovisioner.MachineProvisioner, version *version.Number) (
	environs.StartInstanceParams,
	error,
) {
	pInfo, err := machine.ProvisioningInfo()
	if err != nil {
		return environs.StartInstanceParams{}, errors.Annotatef(err, "fetching provisioning info for machine %q", machine)
	}

	instanceCfg, err := task.constructInstanceConfig(machine, task.auth, pInfo)
	if err != nil {
		return environs.StartInstanceParams{}, errors.Annotatef(err, "creating instance config for machine %q", machine)
	}

	assocProvInfoAndMachCfg(pInfo, instanceCfg)

	var arch string
	if pInfo.Constraints.Arch != nil {
		arch = *pInfo.Constraints.Arch
	}

	possibleTools, err := task.toolsFinder.FindTools(
		*version,
		pInfo.Series,
		arch,
	)
	if err != nil {
		return environs.StartInstanceParams{}, errors.Annotatef(err, "cannot find agent binaries for machine %q", machine)
	}

	startInstanceParams, err := task.constructStartInstanceParams(
		task.controllerUUID,
		machine,
		instanceCfg,
		pInfo,
		possibleTools,
	)
	if err != nil {
		return environs.StartInstanceParams{}, errors.Annotatef(err, "cannot construct params for machine %q", machine)
	}

	return startInstanceParams, nil
}

// populateExcludedMachines translates the results of DeriveAvailabilityZones
// into availabilityZoneMachines.ExcludedMachineIds for zones in which the
// given machine must not be started.
func (task *provisionerTask) populateExcludedMachines(machineId string, startInstanceParams environs.StartInstanceParams) error {
	zonedEnv, ok := task.broker.(providercommon.ZonedEnviron)
	if !ok {
		return nil
	}
	derivedZones, err := zonedEnv.DeriveAvailabilityZones(task.cloudCallCtx, startInstanceParams)
	if err != nil {
		return errors.Trace(err)
	}
	if len(derivedZones) == 0 {
		return nil
	}
	task.machinesMutex.Lock()
	defer task.machinesMutex.Unlock()
	useZones := set.NewStrings(derivedZones...)
	for _, zoneMachines := range task.availabilityZoneMachines {
		if !useZones.Contains(zoneMachines.ZoneName) {
			zoneMachines.ExcludedMachineIds.Add(machineId)
		}
	}
	return nil
}

func (task *provisionerTask) startMachine(
	machine apiprovisioner.MachineProvisioner,
	distributionGroupMachineIds []string,
) error {
	v, err := machine.ModelAgentVersion()
	if err != nil {
		return err
	}
	startInstanceParams, err := task.setupToStartMachine(machine, v)
	if err != nil {
		return task.setErrorStatus("%v", machine, err)
	}

	// Figure out if the zones available to use for a new instance are
	// restricted based on placement, and if so exclude this machine
	// from being started in any other zone.
	if err := task.populateExcludedMachines(machine.Id(), startInstanceParams); err != nil {
		return err
	}

	// TODO (jam): 2017-01-19 Should we be setting this earlier in the cycle?
	if err := machine.SetInstanceStatus(status.Provisioning, "starting", nil); err != nil {
		logger.Errorf("%v", err)
	}

	// TODO ProvisionerParallelization 2017-10-03
	// Improve the retry loop, newer methodology.
	// Is rate limiting handled correctly?
	var result *environs.StartInstanceResult

	// Attempt creating the instance "retryCount" times.
	// If the provider supports availability zones and we're automatically
	// distributing across the zones, then we try each zone for every attempt,
	// or until one of the StartInstance calls returns an error satisfying
	// environs.IsAvailabilityZoneIndependent.
	for attemptsLeft := task.retryStartInstanceStrategy.retryCount; attemptsLeft >= 0; {
		if startInstanceParams.AvailabilityZone, err = task.machineAvailabilityZoneDistribution(
			machine.Id(), distributionGroupMachineIds, startInstanceParams.Constraints,
		); err != nil {
			return task.setErrorStatus("cannot start instance for machine %q: %v", machine, err)
		}
		if startInstanceParams.AvailabilityZone != "" {
			logger.Infof("trying machine %s StartInstance in availability zone %s",
				machine, startInstanceParams.AvailabilityZone)
		}

		attemptResult, err := task.broker.StartInstance(task.cloudCallCtx, startInstanceParams)
		if err == nil {
			result = attemptResult
			break
		} else if attemptsLeft <= 0 {
			// Set the state to error, so the machine will be skipped
			// next time until the error is resolved.
			task.removeMachineFromAZMap(machine)
			return task.setErrorStatus("cannot start instance for machine %q: %v", machine, err)
		}

		retrying := true
		retryMsg := ""
		if startInstanceParams.AvailabilityZone != "" && !environs.IsAvailabilityZoneIndependent(err) {
			// We've specified a zone, and the error may be specific to
			// that zone. Retry in another zone if there are any untried.
			azRemaining, err2 := task.markMachineFailedInAZ(machine, startInstanceParams.AvailabilityZone)
			if err2 != nil {
				if err = task.setErrorStatus("cannot start instance: %v", machine, err2); err != nil {
					logger.Errorf("setting error status: %s", err)
				}
				return err2
			}
			if azRemaining {
				retryMsg = fmt.Sprintf(
					"failed to start machine %s in zone %q, retrying in %v with new availability zone: %s",
					machine, startInstanceParams.AvailabilityZone,
					task.retryStartInstanceStrategy.retryDelay, err,
				)
				logger.Debugf("%s", retryMsg)
				// There are still more zones to try, so don't decrement "attemptsLeft" yet.
				retrying = false
			} else {
				// All availability zones have been attempted for this iteration;
				// clear the failures for the next time around. A given zone may
				// succeed after a prior failure.
				task.clearMachineAZFailures(machine)
			}
		}
		if retrying {
			retryMsg = fmt.Sprintf(
				"failed to start machine %s (%s), retrying in %v (%d more attempts)",
				machine, err.Error(), task.retryStartInstanceStrategy.retryDelay, attemptsLeft,
			)
			logger.Warningf("%s", retryMsg)
			attemptsLeft--
		}

		if err3 := machine.SetInstanceStatus(status.Provisioning, retryMsg, nil); err3 != nil {
			logger.Warningf("failed to set instance status: %v", err3)
		}

		select {
		case <-task.catacomb.Dying():
			return task.catacomb.ErrDying()
		case <-time.After(task.retryStartInstanceStrategy.retryDelay):
		}
	}

	networkConfig := networkingcommon.NetworkConfigFromInterfaceInfo(result.NetworkInfo)
	volumes := volumesToAPIServer(result.Volumes)
	volumeNameToAttachmentInfo := volumeAttachmentsToAPIServer(result.VolumeAttachments)

	// Gather the charm LXD profile names, including the lxd profile names from
	// the container brokers.
	charmLXDProfiles := task.gatherCharmLXDProfiles(
		string(result.Instance.Id()),
		machine.Tag().Id(),
		startInstanceParams.CharmLXDProfiles,
	)

	if err := machine.SetInstanceInfo(
		result.Instance.Id(),
		result.DisplayName,
		startInstanceParams.InstanceConfig.MachineNonce,
		result.Hardware,
		networkConfig,
		volumes,
		volumeNameToAttachmentInfo,
		charmLXDProfiles,
	); err != nil {
		// We need to stop the instance right away here, set error status and go on.
		if err2 := task.setErrorStatus("cannot register instance for machine %v: %v", machine, err); err2 != nil {
			logger.Errorf("%v", errors.Annotate(err2, "cannot set machine's status"))
		}
		if err2 := task.broker.StopInstances(task.cloudCallCtx, result.Instance.Id()); err2 != nil {
			logger.Errorf("%v", errors.Annotate(err2, "after failing to set instance info"))
		}
		return errors.Annotate(err, "cannot set instance info")
	}

	logger.Infof(
		"started machine %s as instance %s with hardware %q, network config %+v, "+
			"volumes %v, volume attachments %v, subnets to zones %v, lxd profiles %v",
		machine,
		result.Instance.Id(),
		result.Hardware,
		networkConfig,
		volumes,
		volumeNameToAttachmentInfo,
		startInstanceParams.SubnetsToZones,
		startInstanceParams.CharmLXDProfiles,
	)
	return nil
}

// gatherCharmLXDProfiles gathers the charm LXD profiles from the different
// sources. This includes getting the information from the broker.
func (task *provisionerTask) gatherCharmLXDProfiles(instanceId, machineTag string, machineProfiles []string) []string {
	if names.IsContainerMachine(machineTag) {
		if manager, ok := task.broker.(container.LXDProfileNameRetriever); ok {
			if profileNames, err := manager.LXDProfileNames(instanceId); err == nil {
				return lxdprofile.LXDProfileNames(profileNames)
			}
		} else {
			logger.Tracef("failed to gather profile names, broker didn't conform to LXDProfileNameRetriever")
		}
	}
	return machineProfiles
}

// markMachineFailedInAZ moves the machine in the given zone from MachineIds to
// FailedMachineIds in availabilityZoneMachines, and reports whether any
// availability zones remain available for the specified machine.
func (task *provisionerTask) markMachineFailedInAZ(machine apiprovisioner.MachineProvisioner, zone string) (bool, error) {
	if zone == "" {
		return false, errors.New("no zone provided")
	}
	task.machinesMutex.Lock()
	defer task.machinesMutex.Unlock()
	azRemaining := false
	for _, zoneMachines := range task.availabilityZoneMachines {
		if zone == zoneMachines.ZoneName {
			zoneMachines.MachineIds.Remove(machine.Id())
			zoneMachines.FailedMachineIds.Add(machine.Id())
			if azRemaining {
				break
			}
		}
		if !zoneMachines.FailedMachineIds.Contains(machine.Id()) &&
			!zoneMachines.ExcludedMachineIds.Contains(machine.Id()) {
			azRemaining = true
		}
	}
	return azRemaining, nil
}

func (task *provisionerTask) clearMachineAZFailures(machine apiprovisioner.MachineProvisioner) {
	task.machinesMutex.Lock()
	defer task.machinesMutex.Unlock()
	for _, zoneMachines := range task.availabilityZoneMachines {
		zoneMachines.FailedMachineIds.Remove(machine.Id())
	}
}

func (task *provisionerTask) addMachineToAZMap(machine *apiprovisioner.Machine, zoneName string) {
	task.machinesMutex.Lock()
	defer task.machinesMutex.Unlock()
	for _, zoneMachines := range task.availabilityZoneMachines {
		if zoneName == zoneMachines.ZoneName {
			zoneMachines.MachineIds.Add(machine.Id())
			break
		}
	}
}

// removeMachineFromAZMap removes the specified machine from availabilityZoneMachines.
// It is assumed this is called when the machines are being deleted from state,
// or have failed provisioning.
func (task *provisionerTask) removeMachineFromAZMap(machine apiprovisioner.MachineProvisioner) {
	machineId := machine.Id()
	task.machinesMutex.Lock()
	defer task.machinesMutex.Unlock()
	for _, zoneMachines := range task.availabilityZoneMachines {
		zoneMachines.MachineIds.Remove(machineId)
		zoneMachines.FailedMachineIds.Remove(machineId)
	}
}

type provisioningInfo struct {
	Constraints    constraints.Value
	Series         string
	Placement      string
	InstanceConfig *instancecfg.InstanceConfig
	SubnetsToZones map[string][]string
}

func assocProvInfoAndMachCfg(
	provInfo *params.ProvisioningInfo,
	instanceConfig *instancecfg.InstanceConfig,
) *provisioningInfo {
	return &provisioningInfo{
		Constraints:    provInfo.Constraints,
		Series:         provInfo.Series,
		Placement:      provInfo.Placement,
		InstanceConfig: instanceConfig,
		SubnetsToZones: provInfo.SubnetsToZones,
	}
}

func volumesToAPIServer(volumes []storage.Volume) []params.Volume {
	result := make([]params.Volume, len(volumes))
	for i, v := range volumes {
		result[i] = params.Volume{
			VolumeTag: v.Tag.String(),
			Info: params.VolumeInfo{
				VolumeId:   v.VolumeId,
				HardwareId: v.HardwareId,
				WWN:        v.WWN, // pool
				Size:       v.Size,
				Persistent: v.Persistent,
			},
		}
	}
	return result
}

func volumeAttachmentsToAPIServer(attachments []storage.VolumeAttachment) map[string]params.VolumeAttachmentInfo {
	result := make(map[string]params.VolumeAttachmentInfo)
	for _, a := range attachments {
		var planInfo *params.VolumeAttachmentPlanInfo
		if a.PlanInfo != nil {
			planInfo = &params.VolumeAttachmentPlanInfo{
				DeviceType:       a.PlanInfo.DeviceType,
				DeviceAttributes: a.PlanInfo.DeviceAttributes,
			}
		}
		result[a.Volume.String()] = params.VolumeAttachmentInfo{
			DeviceName: a.DeviceName,
			DeviceLink: a.DeviceLink,
			BusAddress: a.BusAddress,
			ReadOnly:   a.ReadOnly,
			PlanInfo:   planInfo,
		}
	}
	return result
}
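
// The sketch below is illustrative only and is not part of the upstream
// provisioner code: it shows the machine nonce format described in
// constructInstanceConfig, "<provisioner machine tag>:<random UUID>",
// e.g. "machine-0:1fd6a961-...". The machine tag used here is hypothetical.
func exampleMachineNonce() (string, error) {
	uuid, err := utils.NewUUID()
	if err != nil {
		return "", err
	}
	return fmt.Sprintf("%s:%s", names.NewMachineTag("0"), uuid), nil
}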