github.com/niedbalski/juju@v0.0.0-20190215020005-8ff100488e47/worker/provisioner/provisioner_task.go

     1  // Copyright 2012, 2013 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package provisioner
     5  
     6  import (
     7  	"fmt"
     8  	"sort"
     9  	"strings"
    10  	"sync"
    11  	"time"
    12  
    13  	"github.com/juju/collections/set"
    14  	"github.com/juju/errors"
    15  	"github.com/juju/utils"
    16  	"github.com/juju/version"
    17  	"gopkg.in/juju/names.v2"
    18  	"gopkg.in/juju/worker.v1"
    19  	"gopkg.in/juju/worker.v1/catacomb"
    20  
    21  	apiprovisioner "github.com/juju/juju/api/provisioner"
    22  	"github.com/juju/juju/apiserver/common/networkingcommon"
    23  	"github.com/juju/juju/apiserver/params"
    24  	"github.com/juju/juju/cloudconfig/instancecfg"
    25  	"github.com/juju/juju/container"
    26  	"github.com/juju/juju/controller"
    27  	"github.com/juju/juju/controller/authentication"
    28  	"github.com/juju/juju/core/constraints"
    29  	"github.com/juju/juju/core/instance"
    30  	"github.com/juju/juju/core/lxdprofile"
    31  	"github.com/juju/juju/core/status"
    32  	"github.com/juju/juju/core/watcher"
    33  	"github.com/juju/juju/environs"
    34  	"github.com/juju/juju/environs/config"
    35  	"github.com/juju/juju/environs/context"
    36  	"github.com/juju/juju/environs/imagemetadata"
    37  	"github.com/juju/juju/environs/instances"
    38  	"github.com/juju/juju/environs/simplestreams"
    39  	"github.com/juju/juju/network"
    40  	providercommon "github.com/juju/juju/provider/common"
    41  	"github.com/juju/juju/state"
    42  	"github.com/juju/juju/state/multiwatcher"
    43  	"github.com/juju/juju/storage"
    44  	coretools "github.com/juju/juju/tools"
    45  	"github.com/juju/juju/wrench"
    46  )
    47  
    48  type ProvisionerTask interface {
    49  	worker.Worker
    50  
    51  	// SetHarvestMode sets a flag to indicate how the provisioner task
    52  	// should harvest machines. See config.HarvestMode for
    53  	// documentation of behavior.
    54  	SetHarvestMode(mode config.HarvestMode)
    55  }
    56  
    57  type MachineGetter interface {
    58  	Machines(...names.MachineTag) ([]apiprovisioner.MachineResult, error)
    59  	MachinesWithTransientErrors() ([]apiprovisioner.MachineStatusResult, error)
    60  }
    61  
    62  type DistributionGroupFinder interface {
    63  	DistributionGroupByMachineId(...names.MachineTag) ([]apiprovisioner.DistributionGroupResult, error)
    64  }
    65  
    66  // ToolsFinder is an interface used for finding tools to run on
    67  // provisioned instances.
    68  type ToolsFinder interface {
    69  	// FindTools returns a list of tools matching the specified
    70  	// version, series, and architecture. If arch is empty, the
    71  	// implementation is expected to use a well documented default.
    72  	FindTools(version version.Number, series string, arch string) (coretools.List, error)
    73  }
    74  
    75  func NewProvisionerTask(
    76  	controllerUUID string,
    77  	machineTag names.MachineTag,
    78  	harvestMode config.HarvestMode,
    79  	machineGetter MachineGetter,
    80  	distributionGroupFinder DistributionGroupFinder,
    81  	toolsFinder ToolsFinder,
    82  	machineWatcher watcher.StringsWatcher,
    83  	retryWatcher watcher.NotifyWatcher,
    84  	profileWatcher watcher.StringsWatcher,
    85  	broker environs.InstanceBroker,
    86  	auth authentication.AuthenticationProvider,
    87  	imageStream string,
    88  	retryStartInstanceStrategy RetryStrategy,
    89  	cloudCallContext context.ProviderCallContext,
    90  ) (ProvisionerTask, error) {
    91  	machineChanges := machineWatcher.Changes()
    92  	workers := []worker.Worker{machineWatcher}
    93  	var retryChanges watcher.NotifyChannel
    94  	if retryWatcher != nil {
    95  		retryChanges = retryWatcher.Changes()
    96  		workers = append(workers, retryWatcher)
    97  	}
    98  	profileChanges := profileWatcher.Changes()
    99  	task := &provisionerTask{
   100  		controllerUUID:             controllerUUID,
   101  		machineTag:                 machineTag,
   102  		machineGetter:              machineGetter,
   103  		distributionGroupFinder:    distributionGroupFinder,
   104  		toolsFinder:                toolsFinder,
   105  		machineChanges:             machineChanges,
   106  		retryChanges:               retryChanges,
   107  		profileChanges:             profileChanges,
   108  		broker:                     broker,
   109  		auth:                       auth,
   110  		harvestMode:                harvestMode,
   111  		harvestModeChan:            make(chan config.HarvestMode, 1),
   112  		machines:                   make(map[string]apiprovisioner.MachineProvisioner),
   113  		availabilityZoneMachines:   make([]*AvailabilityZoneMachine, 0),
   114  		imageStream:                imageStream,
   115  		retryStartInstanceStrategy: retryStartInstanceStrategy,
   116  		cloudCallCtx:               cloudCallContext,
   117  	}
   118  	err := catacomb.Invoke(catacomb.Plan{
   119  		Site: &task.catacomb,
   120  		Work: task.loop,
   121  		Init: workers,
   122  	})
   123  	if err != nil {
   124  		return nil, errors.Trace(err)
   125  	}
   126  	// Get existing machine distributions.
   127  	err = task.populateAvailabilityZoneMachines()
   128  	// Not all providers implement ZonedEnviron
   129  	if err != nil && !errors.IsNotImplemented(err) {
   130  		return nil, errors.Trace(err)
   131  	}
   132  	return task, nil
   133  }
   134  
   135  type provisionerTask struct {
   136  	controllerUUID             string
   137  	machineTag                 names.MachineTag
   138  	machineGetter              MachineGetter
   139  	distributionGroupFinder    DistributionGroupFinder
   140  	toolsFinder                ToolsFinder
   141  	machineChanges             watcher.StringsChannel
   142  	retryChanges               watcher.NotifyChannel
   143  	profileChanges             watcher.StringsChannel
   144  	broker                     environs.InstanceBroker
   145  	catacomb                   catacomb.Catacomb
   146  	auth                       authentication.AuthenticationProvider
   147  	imageStream                string
   148  	harvestMode                config.HarvestMode
   149  	harvestModeChan            chan config.HarvestMode
   150  	retryStartInstanceStrategy RetryStrategy
   151  	// instance id -> instance
   152  	instances map[instance.Id]instances.Instance
   153  	// machine id -> machine
   154  	machines                 map[string]apiprovisioner.MachineProvisioner
   155  	machinesMutex            sync.RWMutex
   156  	availabilityZoneMachines []*AvailabilityZoneMachine
   157  	cloudCallCtx             context.ProviderCallContext
   158  }
   159  
   160  // Kill implements worker.Worker.Kill.
   161  func (task *provisionerTask) Kill() {
   162  	task.catacomb.Kill(nil)
   163  }
   164  
   165  // Wait implements worker.Worker.Wait.
   166  func (task *provisionerTask) Wait() error {
   167  	return task.catacomb.Wait()
   168  }
   169  
   170  func (task *provisionerTask) loop() error {
   171  
   172  	// Don't allow the harvesting mode to change until we have read at
   173  	// least one set of changes, which will populate the task.machines
   174  	// map. Otherwise we will potentially see all legitimate instances
   175  	// as unknown.
   176  	var harvestModeChan chan config.HarvestMode
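         	// A receive from this nil channel blocks forever, so harvest-mode
         	// changes are ignored until it is assigned below.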
   177  
    178  	// When the watcher is started, its initial event contains all the
    179  	// machines that are relevant. Since that event is available straight
    180  	// away, we know there will be some changes right off the bat.
   181  	for {
   182  		select {
   183  		case <-task.catacomb.Dying():
   184  			logger.Infof("Shutting down provisioner task %s", task.machineTag)
   185  			return task.catacomb.ErrDying()
   186  		case ids, ok := <-task.machineChanges:
   187  			if !ok {
   188  				return errors.New("machine watcher closed channel")
   189  			}
   190  			if err := task.processMachines(ids); err != nil {
   191  				return errors.Annotate(err, "failed to process updated machines")
   192  			}
   193  			// We've seen a set of changes. Enable modification of
   194  			// harvesting mode.
   195  			harvestModeChan = task.harvestModeChan
   196  		case harvestMode := <-harvestModeChan:
   197  			if harvestMode == task.harvestMode {
   198  				break
   199  			}
   200  			logger.Infof("harvesting mode changed to %s", harvestMode)
   201  			task.harvestMode = harvestMode
   202  			if harvestMode.HarvestUnknown() {
   203  				logger.Infof("harvesting unknown machines")
   204  				if err := task.processMachines(nil); err != nil {
   205  					return errors.Annotate(err, "failed to process machines after safe mode disabled")
   206  				}
   207  			}
   208  		case <-task.retryChanges:
   209  			if err := task.processMachinesWithTransientErrors(); err != nil {
   210  				return errors.Annotate(err, "failed to process machines with transient errors")
   211  			}
   212  		case ids, ok := <-task.profileChanges:
   213  			if !ok {
   214  				return errors.New("profile watcher closed channel")
   215  			}
   216  			if err := task.processProfileChanges(ids); err != nil {
   217  				return errors.Annotate(err, "failed to process updated charm profiles")
   218  			}
   219  		}
   220  	}
   221  }
   222  
   223  // SetHarvestMode implements ProvisionerTask.SetHarvestMode().
   224  func (task *provisionerTask) SetHarvestMode(mode config.HarvestMode) {
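         	// Deliver the new mode, but don't block forever if the task is dying.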
   225  	select {
   226  	case task.harvestModeChan <- mode:
   227  	case <-task.catacomb.Dying():
   228  	}
   229  }
   230  
   231  func (task *provisionerTask) processMachinesWithTransientErrors() error {
   232  	results, err := task.machineGetter.MachinesWithTransientErrors()
   233  	if err != nil {
   234  		return nil
   235  	}
   236  	logger.Tracef("processMachinesWithTransientErrors(%v)", results)
   237  	var pending []apiprovisioner.MachineProvisioner
   238  	for _, result := range results {
   239  		if result.Status.Error != nil {
   240  			logger.Errorf("cannot retry provisioning of machine %q: %v", result.Machine.Id(), result.Status.Error)
   241  			continue
   242  		}
   243  		machine := result.Machine
   244  		if err := machine.SetStatus(status.Pending, "", nil); err != nil {
   245  			logger.Errorf("cannot reset status of machine %q: %v", machine.Id(), err)
   246  			continue
   247  		}
   248  		if err := machine.SetInstanceStatus(status.Provisioning, "", nil); err != nil {
   249  			logger.Errorf("cannot reset instance status of machine %q: %v", machine.Id(), err)
   250  			continue
   251  		}
   252  		task.machinesMutex.Lock()
   253  		task.machines[machine.Tag().String()] = machine
   254  		task.machinesMutex.Unlock()
   255  		pending = append(pending, machine)
   256  	}
   257  	return task.startMachines(pending)
   258  }
   259  
   260  func (task *provisionerTask) processMachines(ids []string) error {
   261  	logger.Tracef("processMachines(%v)", ids)
   262  
   263  	// Populate the tasks maps of current instances and machines.
   264  	if err := task.populateMachineMaps(ids); err != nil {
   265  		return err
   266  	}
   267  
   268  	// Find machines without an instance id or that are dead
   269  	pending, dead, maintain, err := task.pendingOrDeadOrMaintain(ids)
   270  	if err != nil {
   271  		return err
   272  	}
   273  
   274  	// Stop all machines that are dead
   275  	stopping := task.instancesForDeadMachines(dead)
   276  
   277  	// Find running instances that have no machines associated
   278  	unknown, err := task.findUnknownInstances(stopping)
   279  	if err != nil {
   280  		return err
   281  	}
   282  	if !task.harvestMode.HarvestUnknown() {
   283  		logger.Infof(
   284  			"%s is set to %s; unknown instances not stopped %v",
   285  			config.ProvisionerHarvestModeKey,
   286  			task.harvestMode.String(),
   287  			instanceIds(unknown),
   288  		)
   289  		unknown = nil
   290  	}
   291  	if task.harvestMode.HarvestNone() || !task.harvestMode.HarvestDestroyed() {
   292  		logger.Infof(
   293  			`%s is set to "%s"; will not harvest %s`,
   294  			config.ProvisionerHarvestModeKey,
   295  			task.harvestMode.String(),
   296  			instanceIds(stopping),
   297  		)
   298  		stopping = nil
   299  	}
   300  
   301  	if len(stopping) > 0 {
   302  		logger.Infof("stopping known instances %v", stopping)
   303  	}
   304  	if len(unknown) > 0 {
   305  		logger.Infof("stopping unknown instances %v", instanceIds(unknown))
   306  	}
   307  	// It's important that we stop unknown instances before starting
   308  	// pending ones, because if we start an instance and then fail to
   309  	// set its InstanceId on the machine we don't want to start a new
   310  	// instance for the same machine ID.
   311  	if err := task.stopInstances(append(stopping, unknown...)); err != nil {
   312  		return err
   313  	}
   314  
   315  	// Remove any dead machines from state.
   316  	for _, machine := range dead {
   317  		logger.Infof("removing dead machine %q", machine.Id())
   318  		if err := machine.MarkForRemoval(); err != nil {
   319  			logger.Errorf("failed to remove dead machine %q", machine.Id())
   320  		}
   321  		task.removeMachineFromAZMap(machine)
   322  		task.machinesMutex.Lock()
   323  		delete(task.machines, machine.Id())
   324  		task.machinesMutex.Unlock()
   325  	}
   326  
   327  	// Any machines that require maintenance get pinged
   328  	task.maintainMachines(maintain)
   329  
   330  	// Start an instance for the pending ones
   331  	return task.startMachines(pending)
   332  }
   333  
    334  // processProfileChanges adds, removes, or updates LXD profile changes on
    335  // existing machines, if supported by the machine's broker.
    336  //
    337  // If this action is triggered by a charm upgrade, the instance charm profile
    338  // data doc is always created, allowing the uniter to determine whether the
    339  // profile upgrade is in a terminal state before proceeding with the charm
    340  // upgrade itself.
    341  //
    342  // If this action is triggered by a second unit being added to an existing
    343  // machine, the instance charm profile data doc is cleaned up here, provided
    344  // the machine's broker supports LXD profiles.
    345  //
    346  // If the broker does not support LXD profiles, it is harder to determine
    347  // whether the instance charm profile data doc should be cleaned up. It is
    348  // therefore set to NotSupportedStatus, which is then deleted by the uniter
    349  // at its installation.
   350  func (task *provisionerTask) processProfileChanges(ids []string) error {
   351  	logger.Tracef("processProfileChanges(%v)", ids)
   352  	if len(ids) == 0 {
   353  		// TODO: (hml) 2018-11-29
    354  		// This shouldn't be triggered; until that's fixed,
    355  		// short-circuit here when there's nothing to process.
   356  		return nil
   357  	}
   358  
   359  	machineTags := make([]names.MachineTag, len(ids))
   360  	for i, id := range ids {
   361  		machineTags[i] = names.NewMachineTag(id)
   362  	}
   363  	machines, err := task.machineGetter.Machines(machineTags...)
   364  	if err != nil {
   365  		return errors.Annotatef(err, "failed to get machines %v", ids)
   366  	}
   367  	profileBroker, ok := task.broker.(environs.LXDProfiler)
   368  	if !ok {
   369  		logger.Debugf("Attempting to update the profile of a machine that doesn't support profiles")
   370  		profileUpgradeNotSupported(machines)
   371  		return nil
   372  	}
   373  	for i, mResult := range machines {
   374  		if mResult.Err != nil {
    375  			return errors.Annotatef(mResult.Err, "failed to get machine %v", machineTags[i])
   376  		}
   377  		m := mResult.Machine
   378  		removeDoc, err := processOneMachineProfileChange(m, profileBroker)
   379  		if removeDoc {
   380  			if err != nil {
   381  				logger.Errorf("cannot upgrade machine's lxd profile: %s", err.Error())
   382  			}
   383  			if err := m.RemoveUpgradeCharmProfileData(); err != nil {
   384  				logger.Errorf("cannot remove subordinates upgrade charm profile data: %s", err.Error())
   385  			}
   386  		} else if err != nil {
   387  			logger.Errorf("cannot upgrade machine's lxd profile: %s", err.Error())
   388  			if err2 := m.SetUpgradeCharmProfileComplete(lxdprofile.AnnotateErrorStatus(err)); err2 != nil {
   389  				return errors.Annotatef(err2, "cannot set error status for instance charm profile data for machine %q", m)
   390  			}
    391  			// When the status is Error, SetInstanceStatus in the provisioner API
    392  			// will also call SetStatus.
   393  			if err2 := m.SetInstanceStatus(status.Error, "cannot upgrade machine's lxd profile: "+err.Error(), nil); err2 != nil {
   394  				return errors.Annotatef(err2, "cannot set error status for machine %q", m)
   395  			}
   396  		} else {
   397  			// Clean up any residual errors in the machine status from a previous
   398  			// upgrade charm profile failure.
   399  			if err2 := m.SetInstanceStatus(status.Running, "Running", nil); err2 != nil {
    400  				return errors.Annotatef(err2, "cannot set instance status for machine %q", m)
    401  			}
    402  			if err2 := m.SetStatus(status.Started, "", nil); err2 != nil {
    403  				return errors.Annotatef(err2, "cannot set status for machine %q agent", m)
   404  			}
   405  			if err2 := m.SetUpgradeCharmProfileComplete(lxdprofile.SuccessStatus); err2 != nil {
   406  				return errors.Annotatef(err2, "cannot set success status for instance charm profile data for machine %q", m)
   407  			}
   408  		}
   409  	}
   410  	return nil
   411  }
   412  
   413  func profileUpgradeNotSupported(machines []apiprovisioner.MachineResult) {
   414  	for _, mResult := range machines {
   415  		if err := mResult.Machine.SetUpgradeCharmProfileComplete(lxdprofile.NotSupportedStatus); err != nil {
   416  			logger.Errorf("cannot set not supported status for instance charm profile data: %s", err.Error())
   417  		}
   418  	}
   419  }
   420  
   421  func processOneMachineProfileChange(
   422  	m apiprovisioner.MachineProvisioner,
   423  	profileBroker environs.LXDProfiler,
   424  ) (bool, error) {
   425  	logger.Debugf("processOneMachineProfileChange(%s)", m.Id())
   426  	info, err := m.CharmProfileChangeInfo()
   427  	if err != nil {
   428  		return false, err
   429  	}
   430  	instId, err := m.InstanceId()
   431  	if err != nil {
   432  		return false, err
   433  	}
   434  	newProfiles, err := profileBroker.ReplaceOrAddInstanceProfile(string(instId), info.OldProfileName, info.NewProfileName, info.LXDProfile)
   435  	if err != nil {
   436  		return false, err
   437  	}
   438  	// newProfiles:
   439  	//   default
   440  	//   juju-<model>      <-- not included on containers
   441  	//   juju-<model>-<application>-<charm-revision>
   442  	if len(newProfiles) > 1 && newProfiles[0] == "default" {
   443  		newProfiles = newProfiles[1:]
   444  	}
   445  	if len(newProfiles) > 1 {
   446  		// Remove if not juju-<model>-<application>-<charm-revision>
   447  		if _, err = lxdprofile.ProfileRevision(newProfiles[0]); err != nil {
   448  			newProfiles = newProfiles[1:]
   449  		}
   450  	}
   451  	initialAddOfSubordinateProfile := info.Subordinate && info.OldProfileName == ""
   452  	return initialAddOfSubordinateProfile, m.SetCharmProfiles(newProfiles)
   453  }
   454  
   455  func instanceIds(instances []instances.Instance) []string {
   456  	ids := make([]string, 0, len(instances))
   457  	for _, inst := range instances {
   458  		ids = append(ids, string(inst.Id()))
   459  	}
   460  	return ids
   461  }
   462  
   463  // populateMachineMaps updates task.instances. Also updates
   464  // task.machines map if a list of IDs is given.
   465  func (task *provisionerTask) populateMachineMaps(ids []string) error {
   466  	task.instances = make(map[instance.Id]instances.Instance)
   467  
   468  	instances, err := task.broker.AllInstances(task.cloudCallCtx)
   469  	if err != nil {
   470  		return errors.Annotate(err, "failed to get all instances from broker")
   471  	}
   472  	for _, i := range instances {
   473  		task.instances[i.Id()] = i
   474  	}
   475  
   476  	// Update the machines map with new data for each of the machines in the
   477  	// change list.
   478  	machineTags := make([]names.MachineTag, len(ids))
   479  	for i, id := range ids {
   480  		machineTags[i] = names.NewMachineTag(id)
   481  	}
   482  	machines, err := task.machineGetter.Machines(machineTags...)
   483  	if err != nil {
   484  		return errors.Annotatef(err, "failed to get machines %v", ids)
   485  	}
   486  	task.machinesMutex.Lock()
   487  	defer task.machinesMutex.Unlock()
   488  	for i, result := range machines {
   489  		switch {
   490  		case result.Err == nil:
   491  			task.machines[result.Machine.Id()] = result.Machine
   492  		case params.IsCodeNotFoundOrCodeUnauthorized(result.Err):
   493  			logger.Debugf("machine %q not found in state", ids[i])
   494  			delete(task.machines, ids[i])
   495  		default:
   496  			return errors.Annotatef(result.Err, "failed to get machine %v", ids[i])
   497  		}
   498  	}
   499  	return nil
   500  }
   501  
    502  // pendingOrDeadOrMaintain looks up machines with the given ids and returns
    503  // those without an instance id (pending), the dead, and those to maintain.
   504  func (task *provisionerTask) pendingOrDeadOrMaintain(ids []string) (pending, dead, maintain []apiprovisioner.MachineProvisioner, err error) {
   505  	task.machinesMutex.RLock()
   506  	defer task.machinesMutex.RUnlock()
   507  	for _, id := range ids {
   508  		machine, found := task.machines[id]
   509  		if !found {
   510  			logger.Infof("machine %q not found", id)
   511  			continue
   512  		}
   513  		var classification MachineClassification
   514  		classification, err = classifyMachine(machine)
   515  		if err != nil {
   516  			return // return the error
   517  		}
   518  		switch classification {
   519  		case Pending:
   520  			pending = append(pending, machine)
   521  		case Dead:
   522  			dead = append(dead, machine)
   523  		case Maintain:
   524  			maintain = append(maintain, machine)
   525  		}
   526  	}
   527  	logger.Tracef("pending machines: %v", pending)
   528  	logger.Tracef("dead machines: %v", dead)
   529  	return
   530  }
   531  
   532  type ClassifiableMachine interface {
   533  	Life() params.Life
   534  	InstanceId() (instance.Id, error)
   535  	EnsureDead() error
   536  	Status() (status.Status, string, error)
   537  	InstanceStatus() (status.Status, string, error)
   538  	Id() string
   539  }
   540  
   541  type MachineClassification string
   542  
   543  const (
   544  	None     MachineClassification = "none"
   545  	Pending  MachineClassification = "Pending"
   546  	Dead     MachineClassification = "Dead"
   547  	Maintain MachineClassification = "Maintain"
   548  )
   549  
   550  func classifyMachine(machine ClassifiableMachine) (
   551  	MachineClassification, error) {
   552  	switch machine.Life() {
   553  	case params.Dying:
   554  		if _, err := machine.InstanceId(); err == nil {
   555  			return None, nil
   556  		} else if !params.IsCodeNotProvisioned(err) {
   557  			return None, errors.Annotatef(err, "failed to load dying machine id:%s, details:%v", machine.Id(), machine)
   558  		}
   559  		logger.Infof("killing dying, unprovisioned machine %q", machine)
   560  		if err := machine.EnsureDead(); err != nil {
   561  			return None, errors.Annotatef(err, "failed to ensure machine dead id:%s, details:%v", machine.Id(), machine)
   562  		}
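         		// The machine is now dead; fall through and classify it as such.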
   563  		fallthrough
   564  	case params.Dead:
   565  		return Dead, nil
   566  	}
   567  	instId, err := machine.InstanceId()
   568  	if err != nil {
   569  		if !params.IsCodeNotProvisioned(err) {
   570  			return None, errors.Annotatef(err, "failed to load machine id:%s, details:%v", machine.Id(), machine)
   571  		}
   572  		machineStatus, _, err := machine.Status()
   573  		if err != nil {
   574  			logger.Infof("cannot get machine id:%s, details:%v, err:%v", machine.Id(), machine, err)
   575  			return None, nil
   576  		}
   577  		if machineStatus == status.Pending {
   578  			logger.Infof("found machine pending provisioning id:%s, details:%v", machine.Id(), machine)
   579  			return Pending, nil
   580  		}
   581  		instanceStatus, _, err := machine.InstanceStatus()
   582  		if err != nil {
   583  			logger.Infof("cannot read instance status id:%s, details:%v, err:%v", machine.Id(), machine, err)
   584  			return None, nil
   585  		}
   586  		if instanceStatus == status.Provisioning {
   587  			logger.Infof("found machine provisioning id:%s, details:%v", machine.Id(), machine)
   588  			return Pending, nil
   589  		}
   590  		return None, nil
   591  	}
   592  	logger.Infof("machine %s already started as instance %q", machine.Id(), instId)
   593  
   594  	if state.ContainerTypeFromId(machine.Id()) != "" {
   595  		return Maintain, nil
   596  	}
   597  	return None, nil
   598  }
   599  
   600  // findUnknownInstances finds instances which are not associated with a machine.
   601  func (task *provisionerTask) findUnknownInstances(stopping []instances.Instance) ([]instances.Instance, error) {
   602  	// Make a copy of the instances we know about.
   603  	taskInstances := make(map[instance.Id]instances.Instance)
   604  	for k, v := range task.instances {
   605  		taskInstances[k] = v
   606  	}
   607  
   608  	task.machinesMutex.RLock()
   609  	defer task.machinesMutex.RUnlock()
   610  	for _, m := range task.machines {
   611  		instId, err := m.InstanceId()
   612  		switch {
   613  		case err == nil:
   614  			delete(taskInstances, instId)
   615  		case params.IsCodeNotProvisioned(err):
   616  		case params.IsCodeNotFoundOrCodeUnauthorized(err):
   617  		default:
   618  			return nil, err
   619  		}
   620  	}
   621  	// Now remove all those instances that we are stopping already as we
   622  	// know about those and don't want to include them in the unknown list.
   623  	for _, inst := range stopping {
   624  		delete(taskInstances, inst.Id())
   625  	}
   626  	var unknown []instances.Instance
   627  	for _, inst := range taskInstances {
   628  		unknown = append(unknown, inst)
   629  	}
   630  	return unknown, nil
   631  }
   632  
    633  // instancesForDeadMachines returns the instances.Instance representations of
    634  // dead machines running in the provider. Instances that cannot be found, or
    635  // whose machines are marked keep-instance, are omitted from the list.
   636  func (task *provisionerTask) instancesForDeadMachines(deadMachines []apiprovisioner.MachineProvisioner) []instances.Instance {
   637  	var instances []instances.Instance
   638  	for _, machine := range deadMachines {
   639  		instId, err := machine.InstanceId()
   640  		if err == nil {
   641  			keep, _ := machine.KeepInstance()
   642  			if keep {
   643  				logger.Debugf("machine %v is dead but keep-instance is true", instId)
   644  				continue
   645  			}
   646  			inst, found := task.instances[instId]
   647  			// If the instance is not found we can't stop it.
   648  			if found {
   649  				instances = append(instances, inst)
   650  			}
   651  		}
   652  	}
   653  	return instances
   654  }
   655  
   656  func (task *provisionerTask) stopInstances(instances []instances.Instance) error {
    657  	// Although calling StopInstances with an empty slice should produce no change
    658  	// in the provider, environs like dummy do not consider this a no-op.
   659  	if len(instances) == 0 {
   660  		return nil
   661  	}
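         	// The wrench mechanism allows a failure to be injected here
         	// deliberately, typically while testing.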
   662  	if wrench.IsActive("provisioner", "stop-instances") {
   663  		return errors.New("wrench in the works")
   664  	}
   665  
   666  	ids := make([]instance.Id, len(instances))
   667  	for i, inst := range instances {
   668  		ids[i] = inst.Id()
   669  	}
   670  	if err := task.broker.StopInstances(task.cloudCallCtx, ids...); err != nil {
   671  		return errors.Annotate(err, "broker failed to stop instances")
   672  	}
   673  	return nil
   674  }
   675  
   676  func (task *provisionerTask) constructInstanceConfig(
   677  	machine apiprovisioner.MachineProvisioner,
   678  	auth authentication.AuthenticationProvider,
   679  	pInfo *params.ProvisioningInfo,
   680  ) (*instancecfg.InstanceConfig, error) {
   681  
   682  	stateInfo, apiInfo, err := auth.SetupAuthentication(machine)
   683  	if err != nil {
   684  		return nil, errors.Annotate(err, "failed to setup authentication")
   685  	}
   686  
    687  	// Generate a nonce for the new instance, with the format: "machine-#:UUID".
   688  	// The first part is a badge, specifying the tag of the machine the provisioner
   689  	// is running on, while the second part is a random UUID.
   690  	uuid, err := utils.NewUUID()
   691  	if err != nil {
   692  		return nil, errors.Annotate(err, "failed to generate a nonce for machine "+machine.Id())
   693  	}
   694  
   695  	nonce := fmt.Sprintf("%s:%s", task.machineTag, uuid)
   696  	instanceConfig, err := instancecfg.NewInstanceConfig(
   697  		names.NewControllerTag(controller.Config(pInfo.ControllerConfig).ControllerUUID()),
   698  		machine.Id(),
   699  		nonce,
   700  		task.imageStream,
   701  		pInfo.Series,
   702  		apiInfo,
   703  	)
   704  	if err != nil {
   705  		return nil, errors.Trace(err)
   706  	}
   707  
   708  	instanceConfig.Tags = pInfo.Tags
   709  	if len(pInfo.Jobs) > 0 {
   710  		instanceConfig.Jobs = pInfo.Jobs
   711  	}
   712  
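         	// Machines running jobs that need state (i.e. controllers) additionally
         	// get the image signing key, mongo info, and controller config.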
   713  	if multiwatcher.AnyJobNeedsState(instanceConfig.Jobs...) {
   714  		publicKey, err := simplestreams.UserPublicSigningKey()
   715  		if err != nil {
   716  			return nil, err
   717  		}
   718  		instanceConfig.Controller = &instancecfg.ControllerConfig{
   719  			PublicImageSigningKey: publicKey,
   720  			MongoInfo:             stateInfo,
   721  		}
   722  		instanceConfig.Controller.Config = make(map[string]interface{})
   723  		for k, v := range pInfo.ControllerConfig {
   724  			instanceConfig.Controller.Config[k] = v
   725  		}
   726  	}
   727  
   728  	instanceConfig.CloudInitUserData = pInfo.CloudInitUserData
   729  
   730  	return instanceConfig, nil
   731  }
   732  
   733  func (task *provisionerTask) constructStartInstanceParams(
   734  	controllerUUID string,
   735  	machine apiprovisioner.MachineProvisioner,
   736  	instanceConfig *instancecfg.InstanceConfig,
   737  	provisioningInfo *params.ProvisioningInfo,
   738  	possibleTools coretools.List,
   739  ) (environs.StartInstanceParams, error) {
   740  
   741  	volumes := make([]storage.VolumeParams, len(provisioningInfo.Volumes))
   742  	for i, v := range provisioningInfo.Volumes {
   743  		volumeTag, err := names.ParseVolumeTag(v.VolumeTag)
   744  		if err != nil {
   745  			return environs.StartInstanceParams{}, errors.Trace(err)
   746  		}
   747  		if v.Attachment == nil {
   748  			return environs.StartInstanceParams{}, errors.Errorf("volume params missing attachment")
   749  		}
   750  		machineTag, err := names.ParseMachineTag(v.Attachment.MachineTag)
   751  		if err != nil {
   752  			return environs.StartInstanceParams{}, errors.Trace(err)
   753  		}
   754  		if machineTag != machine.Tag() {
   755  			return environs.StartInstanceParams{}, errors.Errorf("volume attachment params has invalid machine tag")
   756  		}
   757  		if v.Attachment.InstanceId != "" {
   758  			return environs.StartInstanceParams{}, errors.Errorf("volume attachment params specifies instance ID")
   759  		}
   760  		volumes[i] = storage.VolumeParams{
   761  			Tag:          volumeTag,
   762  			Size:         v.Size,
   763  			Provider:     storage.ProviderType(v.Provider),
   764  			Attributes:   v.Attributes,
   765  			ResourceTags: v.Tags,
   766  			Attachment: &storage.VolumeAttachmentParams{
   767  				AttachmentParams: storage.AttachmentParams{
   768  					Machine:  machineTag,
   769  					ReadOnly: v.Attachment.ReadOnly,
   770  				},
   771  				Volume: volumeTag,
   772  			},
   773  		}
   774  	}
   775  	volumeAttachments := make([]storage.VolumeAttachmentParams, len(provisioningInfo.VolumeAttachments))
   776  	for i, v := range provisioningInfo.VolumeAttachments {
   777  		volumeTag, err := names.ParseVolumeTag(v.VolumeTag)
   778  		if err != nil {
   779  			return environs.StartInstanceParams{}, errors.Trace(err)
   780  		}
   781  		machineTag, err := names.ParseMachineTag(v.MachineTag)
   782  		if err != nil {
   783  			return environs.StartInstanceParams{}, errors.Trace(err)
   784  		}
   785  		if machineTag != machine.Tag() {
   786  			return environs.StartInstanceParams{}, errors.Errorf("volume attachment params has invalid machine tag")
   787  		}
   788  		if v.InstanceId != "" {
   789  			return environs.StartInstanceParams{}, errors.Errorf("volume attachment params specifies instance ID")
   790  		}
   791  		if v.VolumeId == "" {
   792  			return environs.StartInstanceParams{}, errors.Errorf("volume attachment params does not specify volume ID")
   793  		}
   794  		volumeAttachments[i] = storage.VolumeAttachmentParams{
   795  			AttachmentParams: storage.AttachmentParams{
   796  				Provider: storage.ProviderType(v.Provider),
   797  				Machine:  machineTag,
   798  				ReadOnly: v.ReadOnly,
   799  			},
   800  			Volume:   volumeTag,
   801  			VolumeId: v.VolumeId,
   802  		}
   803  	}
   804  
   805  	var subnetsToZones map[network.Id][]string
   806  	if provisioningInfo.SubnetsToZones != nil {
   807  		// Convert subnet provider ids from string to network.Id.
   808  		subnetsToZones = make(map[network.Id][]string, len(provisioningInfo.SubnetsToZones))
   809  		for providerId, zones := range provisioningInfo.SubnetsToZones {
   810  			subnetsToZones[network.Id(providerId)] = zones
   811  		}
   812  	}
   813  
   814  	var endpointBindings map[string]network.Id
   815  	if len(provisioningInfo.EndpointBindings) != 0 {
   816  		endpointBindings = make(map[string]network.Id)
   817  		for endpoint, space := range provisioningInfo.EndpointBindings {
   818  			endpointBindings[endpoint] = network.Id(space)
   819  		}
   820  	}
   821  	possibleImageMetadata := make([]*imagemetadata.ImageMetadata, len(provisioningInfo.ImageMetadata))
   822  	for i, metadata := range provisioningInfo.ImageMetadata {
   823  		possibleImageMetadata[i] = &imagemetadata.ImageMetadata{
   824  			Id:          metadata.ImageId,
   825  			Arch:        metadata.Arch,
   826  			RegionAlias: metadata.Region,
   827  			RegionName:  metadata.Region,
   828  			Storage:     metadata.RootStorageType,
   829  			Stream:      metadata.Stream,
   830  			VirtType:    metadata.VirtType,
   831  			Version:     metadata.Version,
   832  		}
   833  	}
   834  
   835  	startInstanceParams := environs.StartInstanceParams{
   836  		ControllerUUID:    controllerUUID,
   837  		Constraints:       provisioningInfo.Constraints,
   838  		Tools:             possibleTools,
   839  		InstanceConfig:    instanceConfig,
   840  		Placement:         provisioningInfo.Placement,
   841  		Volumes:           volumes,
   842  		VolumeAttachments: volumeAttachments,
   843  		SubnetsToZones:    subnetsToZones,
   844  		EndpointBindings:  endpointBindings,
   845  		ImageMetadata:     possibleImageMetadata,
   846  		StatusCallback:    machine.SetInstanceStatus,
   847  		Abort:             task.catacomb.Dying(),
   848  		CharmLXDProfiles:  provisioningInfo.CharmLXDProfiles,
   849  	}
   850  
   851  	return startInstanceParams, nil
   852  }
   853  
   854  func (task *provisionerTask) maintainMachines(machines []apiprovisioner.MachineProvisioner) error {
   855  	for _, m := range machines {
   856  		logger.Infof("maintainMachines: %v", m)
   857  		startInstanceParams := environs.StartInstanceParams{}
   858  		startInstanceParams.InstanceConfig = &instancecfg.InstanceConfig{}
   859  		startInstanceParams.InstanceConfig.MachineId = m.Id()
   860  		if err := task.broker.MaintainInstance(task.cloudCallCtx, startInstanceParams); err != nil {
   861  			return errors.Annotatef(err, "cannot maintain machine %v", m)
   862  		}
   863  	}
   864  	return nil
   865  }
   866  
    867  // AvailabilityZoneMachine keeps track of a single zone: which machines
    868  // are in it, which machines have failed to use it, and which machines
    869  // shouldn't use it. This data is used to decide how to distribute
    870  // machines across availability zones.
   871  //
   872  // Exposed for testing.
   873  type AvailabilityZoneMachine struct {
   874  	ZoneName           string
   875  	MachineIds         set.Strings
   876  	FailedMachineIds   set.Strings
   877  	ExcludedMachineIds set.Strings // Don't use these machines in the zone.
   878  }
   879  
    880  // populateAvailabilityZoneMachines fills in availabilityZoneMachines, if
    881  // empty, with the current mapping of availability zones to the IDs of
    882  // machines running in each zone. If the provider does not implement the
    883  // ZonedEnviron interface, it returns nil.
   884  func (task *provisionerTask) populateAvailabilityZoneMachines() error {
   885  	task.machinesMutex.Lock()
   886  	defer task.machinesMutex.Unlock()
   887  
   888  	if len(task.availabilityZoneMachines) > 0 {
   889  		return nil
   890  	}
   891  	zonedEnv, ok := task.broker.(providercommon.ZonedEnviron)
   892  	if !ok {
   893  		return nil
   894  	}
   895  
    896  	// With an empty slice of instance IDs, AvailabilityZoneAllocations() returns
    897  	// all of the "available" availability zones and their instance allocations.
   898  	availabilityZoneInstances, err := providercommon.AvailabilityZoneAllocations(
   899  		zonedEnv, task.cloudCallCtx, []instance.Id{})
   900  	if err != nil {
   901  		return err
   902  	}
   903  
   904  	instanceMachines := make(map[instance.Id]string)
   905  	for _, machine := range task.machines {
   906  		instId, err := machine.InstanceId()
   907  		if err != nil {
   908  			continue
   909  		}
   910  		instanceMachines[instId] = machine.Id()
   911  	}
   912  
    913  	// Convert instance IDs to machine IDs to aid in distributing
    914  	// not-yet-created instances across availability zones.
   915  	task.availabilityZoneMachines = make([]*AvailabilityZoneMachine, len(availabilityZoneInstances))
   916  	for i, instances := range availabilityZoneInstances {
   917  		machineIds := set.NewStrings()
   918  		for _, instanceId := range instances.Instances {
   919  			if id, ok := instanceMachines[instanceId]; ok {
   920  				machineIds.Add(id)
   921  			}
   922  		}
   923  		task.availabilityZoneMachines[i] = &AvailabilityZoneMachine{
   924  			ZoneName:           instances.ZoneName,
   925  			MachineIds:         machineIds,
   926  			FailedMachineIds:   set.NewStrings(),
   927  			ExcludedMachineIds: set.NewStrings(),
   928  		}
   929  	}
   930  	return nil
   931  }
   932  
   933  // populateDistributionGroupZoneMap returns a zone mapping which only includes
   934  // machines in the same distribution group.  This is used to determine where new
   935  // machines in that distribution group should be placed.
   936  func (task *provisionerTask) populateDistributionGroupZoneMap(machineIds []string) []*AvailabilityZoneMachine {
   937  	var dgAvailabilityZoneMachines []*AvailabilityZoneMachine
   938  	dgSet := set.NewStrings(machineIds...)
   939  	for _, azm := range task.availabilityZoneMachines {
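         		// Restrict each zone's machine set to members of the distribution group.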
   940  		dgAvailabilityZoneMachines = append(dgAvailabilityZoneMachines, &AvailabilityZoneMachine{
   941  			azm.ZoneName,
   942  			azm.MachineIds.Intersection(dgSet),
   943  			azm.FailedMachineIds,
   944  			azm.ExcludedMachineIds,
   945  		})
   946  	}
   947  	return dgAvailabilityZoneMachines
   948  }
   949  
   950  // machineAvailabilityZoneDistribution returns a suggested availability zone
   951  // for the specified machine to start in.
   952  // If the current provider does not implement availability zones, "" and no
   953  // error will be returned.
   954  // Machines are spread across availability zones based on lowest population of
   955  // the "available" zones, and any supplied zone constraints.
   956  // Machines in the same DistributionGroup are placed in different zones,
   957  // distributed based on lowest population of machines in that DistributionGroup.
   958  // Machines are not placed in a zone they are excluded from.
    959  // If availability zones are implemented and none is suitable, a NotFound error is returned.
   960  func (task *provisionerTask) machineAvailabilityZoneDistribution(
   961  	machineId string, distGroupMachineIds []string, cons constraints.Value,
   962  ) (string, error) {
   963  	task.machinesMutex.Lock()
   964  	defer task.machinesMutex.Unlock()
   965  
   966  	if len(task.availabilityZoneMachines) == 0 {
   967  		return "", nil
   968  	}
   969  
   970  	// Assign an initial zone to a machine based on lowest population,
   971  	// accommodating any supplied zone constraints.
   972  	// If the machine has a distribution group, assign based on lowest zone
    973  	// population of the distribution group's machines.
   974  	var machineZone string
   975  	if len(distGroupMachineIds) > 0 {
   976  		dgZoneMap := azMachineFilterSort(task.populateDistributionGroupZoneMap(distGroupMachineIds)).FilterZones(cons)
   977  		sort.Sort(dgZoneMap)
   978  		for _, dgZoneMachines := range dgZoneMap {
   979  			if !dgZoneMachines.FailedMachineIds.Contains(machineId) &&
   980  				!dgZoneMachines.ExcludedMachineIds.Contains(machineId) {
   981  				machineZone = dgZoneMachines.ZoneName
   982  				for _, azm := range task.availabilityZoneMachines {
   983  					if azm.ZoneName == dgZoneMachines.ZoneName {
   984  						azm.MachineIds.Add(machineId)
   985  						break
   986  					}
   987  				}
   988  				break
   989  			}
   990  		}
   991  	} else {
   992  		zoneMap := azMachineFilterSort(task.availabilityZoneMachines).FilterZones(cons)
   993  		sort.Sort(zoneMap)
   994  		for _, zoneMachines := range zoneMap {
   995  			if !zoneMachines.FailedMachineIds.Contains(machineId) &&
   996  				!zoneMachines.ExcludedMachineIds.Contains(machineId) {
   997  				machineZone = zoneMachines.ZoneName
   998  				zoneMachines.MachineIds.Add(machineId)
   999  				break
  1000  			}
  1001  		}
  1002  	}
  1003  	if machineZone == "" {
  1004  		return machineZone, errors.NotFoundf("suitable availability zone for machine %v", machineId)
  1005  	}
  1006  	return machineZone, nil
  1007  }
  1008  
  1009  // azMachineFilterSort extends a slice of AvailabilityZoneMachine references
  1010  // with a sort implementation by zone population and name,
  1011  // and filtration based on zones expressed in constraints.
  1012  type azMachineFilterSort []*AvailabilityZoneMachine
  1013  
  1014  // FilterZones returns a new instance consisting of slice members limited to
  1015  // zones expressed in the input constraints.
  1016  // Absence of zone constraints leaves the return unfiltered.
  1017  func (a azMachineFilterSort) FilterZones(cons constraints.Value) azMachineFilterSort {
  1018  	if !cons.HasZones() {
  1019  		return a
  1020  	}
  1021  
  1022  	logger.Debugf("applying availability zone constraints: %s", strings.Join(*cons.Zones, ", "))
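         	// Filter in place, reusing the slice's backing array.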
  1023  	filtered := a[:0]
  1024  	for _, azm := range a {
  1025  		for _, zone := range *cons.Zones {
  1026  			if azm.ZoneName == zone {
  1027  				filtered = append(filtered, azm)
  1028  				break
  1029  			}
  1030  		}
  1031  	}
  1032  	return filtered
  1033  }
  1034  
  1035  func (a azMachineFilterSort) Len() int {
  1036  	return len(a)
  1037  }
  1038  
  1039  func (a azMachineFilterSort) Less(i, j int) bool {
  1040  	switch {
  1041  	case a[i].MachineIds.Size() < a[j].MachineIds.Size():
  1042  		return true
  1043  	case a[i].MachineIds.Size() == a[j].MachineIds.Size():
  1044  		return a[i].ZoneName < a[j].ZoneName
  1045  	}
  1046  	return false
  1047  }
  1048  
  1049  func (a azMachineFilterSort) Swap(i, j int) {
  1050  	a[i], a[j] = a[j], a[i]
  1051  }
  1052  
   1053  // startMachines starts a goroutine for each specified machine to provision
   1054  // it. Errors from individual provisioning attempts are aggregated and returned.
  1055  func (task *provisionerTask) startMachines(machines []apiprovisioner.MachineProvisioner) error {
  1056  	if len(machines) == 0 {
  1057  		return nil
  1058  	}
  1059  
  1060  	// Get the distributionGroups for each machine now to avoid
  1061  	// successive calls to DistributionGroupByMachineId which will
  1062  	// return the same data.
  1063  	machineTags := make([]names.MachineTag, len(machines))
  1064  	for i, machine := range machines {
  1065  		machineTags[i] = machine.MachineTag()
  1066  	}
  1067  	machineDistributionGroups, err := task.distributionGroupFinder.DistributionGroupByMachineId(machineTags...)
  1068  	if err != nil {
  1069  		return err
  1070  	}
  1071  
  1072  	var wg sync.WaitGroup
  1073  	errMachines := make([]error, len(machines))
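         	// Each goroutine records its error at its own index, so no extra
         	// locking is needed around errMachines.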
  1074  	for i, m := range machines {
  1075  		if machineDistributionGroups[i].Err != nil {
  1076  			task.setErrorStatus(
  1077  				"fetching distribution groups for machine %q: %v",
  1078  				m, machineDistributionGroups[i].Err,
  1079  			)
  1080  			continue
  1081  		}
  1082  		wg.Add(1)
  1083  		go func(machine apiprovisioner.MachineProvisioner, dg []string, index int) {
  1084  			defer wg.Done()
  1085  			if err := task.startMachine(machine, dg); err != nil {
  1086  				task.removeMachineFromAZMap(machine)
  1087  				errMachines[index] = err
  1088  			}
  1089  		}(m, machineDistributionGroups[i].MachineIds, i)
  1090  	}
  1091  
  1092  	wg.Wait()
  1093  	select {
  1094  	case <-task.catacomb.Dying():
  1095  		return task.catacomb.ErrDying()
  1096  	default:
  1097  	}
  1098  	var errorStrings []string
  1099  	for _, err := range errMachines {
  1100  		if err != nil {
  1101  			errorStrings = append(errorStrings, err.Error())
  1102  		}
  1103  	}
  1104  	if errorStrings != nil {
  1105  		return errors.New(strings.Join(errorStrings, "\n"))
  1106  	}
  1107  	return nil
  1108  }
  1109  
  1110  func (task *provisionerTask) setErrorStatus(message string, machine apiprovisioner.MachineProvisioner, err error) error {
  1111  	logger.Errorf(message, machine, err)
  1112  	errForStatus := errors.Cause(err)
  1113  	if err2 := machine.SetInstanceStatus(status.ProvisioningError, errForStatus.Error(), nil); err2 != nil {
  1114  		// Something is wrong with this machine, better report it back.
  1115  		return errors.Annotatef(err2, "cannot set error status for machine %q", machine)
  1116  	}
  1117  	return nil
  1118  }
  1119  
   1120  // setupToStartMachine gathers the necessary information,
   1121  // based on the specified machine, to fetch its ProvisioningInfo and build
   1122  // the StartInstanceParams to be used by startMachine.
  1123  func (task *provisionerTask) setupToStartMachine(machine apiprovisioner.MachineProvisioner, version *version.Number) (
  1124  	environs.StartInstanceParams,
  1125  	error,
  1126  ) {
  1127  	pInfo, err := machine.ProvisioningInfo()
  1128  	if err != nil {
  1129  		return environs.StartInstanceParams{}, errors.Annotatef(err, "fetching provisioning info for machine %q", machine)
  1130  	}
  1131  
  1132  	instanceCfg, err := task.constructInstanceConfig(machine, task.auth, pInfo)
  1133  	if err != nil {
  1134  		return environs.StartInstanceParams{}, errors.Annotatef(err, "creating instance config for machine %q", machine)
  1135  	}
  1136  
  1137  	assocProvInfoAndMachCfg(pInfo, instanceCfg)
  1138  
  1139  	var arch string
  1140  	if pInfo.Constraints.Arch != nil {
  1141  		arch = *pInfo.Constraints.Arch
  1142  	}
  1143  
  1144  	possibleTools, err := task.toolsFinder.FindTools(
  1145  		*version,
  1146  		pInfo.Series,
  1147  		arch,
  1148  	)
  1149  	if err != nil {
  1150  		return environs.StartInstanceParams{}, errors.Annotatef(err, "cannot find agent binaries for machine %q", machine)
  1151  	}
  1152  
  1153  	startInstanceParams, err := task.constructStartInstanceParams(
  1154  		task.controllerUUID,
  1155  		machine,
  1156  		instanceCfg,
  1157  		pInfo,
  1158  		possibleTools,
  1159  	)
  1160  	if err != nil {
  1161  		return environs.StartInstanceParams{}, errors.Annotatef(err, "cannot construct params for machine %q", machine)
  1162  	}
  1163  
  1164  	return startInstanceParams, nil
  1165  }
  1166  
   1167  // populateExcludedMachines translates the results of DeriveAvailabilityZones
   1168  // into availabilityZoneMachines.ExcludedMachineIds, marking the zones in
   1169  // which the given machine must not be placed.
  1170  func (task *provisionerTask) populateExcludedMachines(machineId string, startInstanceParams environs.StartInstanceParams) error {
  1171  	zonedEnv, ok := task.broker.(providercommon.ZonedEnviron)
  1172  	if !ok {
  1173  		return nil
  1174  	}
  1175  	derivedZones, err := zonedEnv.DeriveAvailabilityZones(task.cloudCallCtx, startInstanceParams)
  1176  	if err != nil {
  1177  		return errors.Trace(err)
  1178  	}
  1179  	if len(derivedZones) == 0 {
  1180  		return nil
  1181  	}
  1182  	task.machinesMutex.Lock()
  1183  	defer task.machinesMutex.Unlock()
  1184  	useZones := set.NewStrings(derivedZones...)
  1185  	for _, zoneMachines := range task.availabilityZoneMachines {
  1186  		if !useZones.Contains(zoneMachines.ZoneName) {
  1187  			zoneMachines.ExcludedMachineIds.Add(machineId)
  1188  		}
  1189  	}
  1190  	return nil
  1191  }
  1192  
  1193  func (task *provisionerTask) startMachine(
  1194  	machine apiprovisioner.MachineProvisioner,
  1195  	distributionGroupMachineIds []string,
  1196  ) error {
  1197  	v, err := machine.ModelAgentVersion()
  1198  	if err != nil {
  1199  		return err
  1200  	}
  1201  	startInstanceParams, err := task.setupToStartMachine(machine, v)
  1202  	if err != nil {
  1203  		return task.setErrorStatus("%v", machine, err)
  1204  	}
  1205  
   1206  	// Figure out if the zones available to use for a new instance are
   1207  	// restricted based on placement, and if so exclude this machine
   1208  	// from being started in any other zone.
  1209  	if err := task.populateExcludedMachines(machine.Id(), startInstanceParams); err != nil {
  1210  		return err
  1211  	}
  1212  
  1213  	// TODO (jam): 2017-01-19 Should we be setting this earlier in the cycle?
  1214  	if err := machine.SetInstanceStatus(status.Provisioning, "starting", nil); err != nil {
  1215  		logger.Errorf("%v", err)
  1216  	}
  1217  
  1218  	// TODO ProvisionerParallelization 2017-10-03
  1219  	// Improve the retry loop, newer methodology
  1220  	// Is rate limiting handled correctly?
  1221  	var result *environs.StartInstanceResult
  1222  
  1223  	// Attempt creating the instance "retryCount" times. If the provider
  1224  	// supports availability zones and we're automatically distributing
  1225  	// across the zones, then we try each zone for every attempt, or until
  1226  	// one of the StartInstance calls returns an error satisfying
  1227  	// environs.IsAvailabilityZoneIndependent.
  1228  	for attemptsLeft := task.retryStartInstanceStrategy.retryCount; attemptsLeft >= 0; {
  1229  		if startInstanceParams.AvailabilityZone, err = task.machineAvailabilityZoneDistribution(
  1230  			machine.Id(), distributionGroupMachineIds, startInstanceParams.Constraints,
  1231  		); err != nil {
  1232  			return task.setErrorStatus("cannot start instance for machine %q: %v", machine, err)
  1233  		}
  1234  		if startInstanceParams.AvailabilityZone != "" {
  1235  			logger.Infof("trying machine %s StartInstance in availability zone %s",
  1236  				machine, startInstanceParams.AvailabilityZone)
  1237  		}
  1238  
  1239  		attemptResult, err := task.broker.StartInstance(task.cloudCallCtx, startInstanceParams)
  1240  		if err == nil {
  1241  			result = attemptResult
  1242  			break
  1243  		} else if attemptsLeft <= 0 {
  1244  			// Set the state to error, so the machine will be skipped
  1245  			// next time until the error is resolved.
  1246  			task.removeMachineFromAZMap(machine)
  1247  			return task.setErrorStatus("cannot start instance for machine %q: %v", machine, err)
  1248  		}
  1249  
  1250  		retrying := true
  1251  		retryMsg := ""
  1252  		if startInstanceParams.AvailabilityZone != "" && !environs.IsAvailabilityZoneIndependent(err) {
  1253  			// We've specified a zone, and the error may be specific to
  1254  			// that zone. Retry in another zone if there are any untried.
  1255  			azRemaining, err2 := task.markMachineFailedInAZ(machine, startInstanceParams.AvailabilityZone)
  1256  			if err2 != nil {
  1257  				if err = task.setErrorStatus("cannot start instance: %v", machine, err2); err != nil {
  1258  					logger.Errorf("setting error status: %s", err)
  1259  				}
  1260  				return err2
  1261  			}
  1262  			if azRemaining {
  1263  				retryMsg = fmt.Sprintf(
  1264  					"failed to start machine %s in zone %q, retrying in %v with new availability zone: %s",
  1265  					machine, startInstanceParams.AvailabilityZone,
  1266  					task.retryStartInstanceStrategy.retryDelay, err,
  1267  				)
  1268  				logger.Debugf("%s", retryMsg)
   1269  				// There are still more zones to try, so don't decrement "attemptsLeft" yet.
  1270  				retrying = false
  1271  			} else {
  1272  				// All availability zones have been attempted for this iteration,
  1273  				// clear the failures for the next time around. A given zone may
  1274  				// succeed after a prior failure.
  1275  				task.clearMachineAZFailures(machine)
  1276  			}
  1277  		}
  1278  		if retrying {
  1279  			retryMsg = fmt.Sprintf(
  1280  				"failed to start machine %s (%s), retrying in %v (%d more attempts)",
  1281  				machine, err.Error(), task.retryStartInstanceStrategy.retryDelay, attemptsLeft,
  1282  			)
  1283  			logger.Warningf("%s", retryMsg)
  1284  			attemptsLeft--
  1285  		}
  1286  
  1287  		if err3 := machine.SetInstanceStatus(status.Provisioning, retryMsg, nil); err3 != nil {
  1288  			logger.Warningf("failed to set instance status: %v", err3)
  1289  		}
  1290  
  1291  		select {
  1292  		case <-task.catacomb.Dying():
  1293  			return task.catacomb.ErrDying()
  1294  		case <-time.After(task.retryStartInstanceStrategy.retryDelay):
  1295  		}
  1296  	}
  1297  
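         	// At this point the loop either succeeded and set result, or returned
         	// above after exhausting all attempts.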
  1298  	networkConfig := networkingcommon.NetworkConfigFromInterfaceInfo(result.NetworkInfo)
  1299  	volumes := volumesToAPIServer(result.Volumes)
  1300  	volumeNameToAttachmentInfo := volumeAttachmentsToAPIServer(result.VolumeAttachments)
  1301  
   1302  	// Gather the charm LXD profile names, including the LXD profile names
   1303  	// reported by container brokers.
  1304  	charmLXDProfiles := task.gatherCharmLXDProfiles(
  1305  		string(result.Instance.Id()),
  1306  		machine.Tag().Id(),
  1307  		startInstanceParams.CharmLXDProfiles,
  1308  	)
  1309  
  1310  	if err := machine.SetInstanceInfo(
  1311  		result.Instance.Id(),
  1312  		result.DisplayName,
  1313  		startInstanceParams.InstanceConfig.MachineNonce,
  1314  		result.Hardware,
  1315  		networkConfig,
  1316  		volumes,
  1317  		volumeNameToAttachmentInfo,
  1318  		charmLXDProfiles,
  1319  	); err != nil {
  1320  		// We need to stop the instance right away here, set error status and go on.
  1321  		if err2 := task.setErrorStatus("cannot register instance for machine %v: %v", machine, err); err2 != nil {
  1322  			logger.Errorf("%v", errors.Annotate(err2, "cannot set machine's status"))
  1323  		}
  1324  		if err2 := task.broker.StopInstances(task.cloudCallCtx, result.Instance.Id()); err2 != nil {
  1325  			logger.Errorf("%v", errors.Annotate(err2, "after failing to set instance info"))
  1326  		}
  1327  		return errors.Annotate(err, "cannot set instance info")
  1328  	}
  1329  
  1330  	logger.Infof(
  1331  		"started machine %s as instance %s with hardware %q, network config %+v, "+
  1332  			"volumes %v, volume attachments %v, subnets to zones %v, lxd profiles %v",
  1333  		machine,
  1334  		result.Instance.Id(),
  1335  		result.Hardware,
  1336  		networkConfig,
  1337  		volumes,
  1338  		volumeNameToAttachmentInfo,
  1339  		startInstanceParams.SubnetsToZones,
  1340  		startInstanceParams.CharmLXDProfiles,
  1341  	)
  1342  	return nil
  1343  }
  1344  
   1345  // gatherCharmLXDProfiles collects the charm's LXD profiles from the different
   1346  // sources. This includes getting the information from the broker.
  1347  func (task *provisionerTask) gatherCharmLXDProfiles(instanceId, machineTag string, machineProfiles []string) []string {
  1348  	if names.IsContainerMachine(machineTag) {
  1349  		if manager, ok := task.broker.(container.LXDProfileNameRetriever); ok {
  1350  			if profileNames, err := manager.LXDProfileNames(instanceId); err == nil {
  1351  				return lxdprofile.LXDProfileNames(profileNames)
  1352  			}
  1353  		} else {
  1354  			logger.Tracef("failed to gather profile names, broker didn't conform to LXDProfileNameRetriever")
  1355  		}
  1356  	}
  1357  	return machineProfiles
  1358  }
  1359  
   1360  // markMachineFailedInAZ moves the machine in the given zone from MachineIds to
   1361  // FailedMachineIds in availabilityZoneMachines, and reports whether any
   1362  // availability zones remain that have not failed for the specified machine.
  1363  func (task *provisionerTask) markMachineFailedInAZ(machine apiprovisioner.MachineProvisioner, zone string) (bool, error) {
  1364  	if zone == "" {
  1365  		return false, errors.New("no zone provided")
  1366  	}
  1367  	task.machinesMutex.Lock()
  1368  	defer task.machinesMutex.Unlock()
  1369  	azRemaining := false
  1370  	for _, zoneMachines := range task.availabilityZoneMachines {
  1371  		if zone == zoneMachines.ZoneName {
  1372  			zoneMachines.MachineIds.Remove(machine.Id())
  1373  			zoneMachines.FailedMachineIds.Add(machine.Id())
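         			// Stop early once the failed zone has been marked, if another
         			// usable zone has already been seen.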
  1374  			if azRemaining {
  1375  				break
  1376  			}
  1377  		}
  1378  		if !zoneMachines.FailedMachineIds.Contains(machine.Id()) &&
  1379  			!zoneMachines.ExcludedMachineIds.Contains(machine.Id()) {
  1380  			azRemaining = true
  1381  		}
  1382  	}
  1383  	return azRemaining, nil
  1384  }
  1385  
  1386  func (task *provisionerTask) clearMachineAZFailures(machine apiprovisioner.MachineProvisioner) {
  1387  	task.machinesMutex.Lock()
  1388  	defer task.machinesMutex.Unlock()
  1389  	for _, zoneMachines := range task.availabilityZoneMachines {
  1390  		zoneMachines.FailedMachineIds.Remove(machine.Id())
  1391  	}
  1392  }
  1393  
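         // addMachineToAZMap records the given machine as a member of the named
         // availability zone in availabilityZoneMachines.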
  1394  func (task *provisionerTask) addMachineToAZMap(machine *apiprovisioner.Machine, zoneName string) {
  1395  	task.machinesMutex.Lock()
  1396  	defer task.machinesMutex.Unlock()
  1397  	for _, zoneMachines := range task.availabilityZoneMachines {
  1398  		if zoneName == zoneMachines.ZoneName {
  1399  			zoneMachines.MachineIds.Add(machine.Id())
  1400  			break
  1401  		}
  1402  	}
  1403  	return
  1404  }
  1405  
   1406  // removeMachineFromAZMap removes the specified machine from availabilityZoneMachines.
   1407  // It is assumed this is called when the machine is being deleted from state, or when
   1408  // provisioning has failed.
  1409  func (task *provisionerTask) removeMachineFromAZMap(machine apiprovisioner.MachineProvisioner) {
  1410  	machineId := machine.Id()
  1411  	task.machinesMutex.Lock()
  1412  	defer task.machinesMutex.Unlock()
  1413  	for _, zoneMachines := range task.availabilityZoneMachines {
  1414  		zoneMachines.MachineIds.Remove(machineId)
  1415  		zoneMachines.FailedMachineIds.Remove(machineId)
  1416  	}
  1417  }
  1418  
  1419  type provisioningInfo struct {
  1420  	Constraints    constraints.Value
  1421  	Series         string
  1422  	Placement      string
  1423  	InstanceConfig *instancecfg.InstanceConfig
  1424  	SubnetsToZones map[string][]string
  1425  }
  1426  
  1427  func assocProvInfoAndMachCfg(
  1428  	provInfo *params.ProvisioningInfo,
  1429  	instanceConfig *instancecfg.InstanceConfig,
  1430  ) *provisioningInfo {
  1431  	return &provisioningInfo{
  1432  		Constraints:    provInfo.Constraints,
  1433  		Series:         provInfo.Series,
  1434  		Placement:      provInfo.Placement,
  1435  		InstanceConfig: instanceConfig,
  1436  		SubnetsToZones: provInfo.SubnetsToZones,
  1437  	}
  1438  }
  1439  
  1440  func volumesToAPIServer(volumes []storage.Volume) []params.Volume {
  1441  	result := make([]params.Volume, len(volumes))
  1442  	for i, v := range volumes {
  1443  		result[i] = params.Volume{
  1444  			VolumeTag: v.Tag.String(),
  1445  			Info: params.VolumeInfo{
  1446  				VolumeId:   v.VolumeId,
  1447  				HardwareId: v.HardwareId,
   1448  				WWN:        v.WWN,
  1449  				Size:       v.Size,
  1450  				Persistent: v.Persistent,
  1451  			},
  1452  		}
  1453  	}
  1454  	return result
  1455  }
  1456  
  1457  func volumeAttachmentsToAPIServer(attachments []storage.VolumeAttachment) map[string]params.VolumeAttachmentInfo {
  1458  	result := make(map[string]params.VolumeAttachmentInfo)
  1459  	for _, a := range attachments {
   1460  		var planInfo *params.VolumeAttachmentPlanInfo
   1461  		if a.PlanInfo != nil {
   1462  			// Allocate planInfo first; assigning through a nil pointer would panic.
   1463  			planInfo = &params.VolumeAttachmentPlanInfo{DeviceType: a.PlanInfo.DeviceType, DeviceAttributes: a.PlanInfo.DeviceAttributes}
   1464  		}
  1465  		result[a.Volume.String()] = params.VolumeAttachmentInfo{
  1466  			DeviceName: a.DeviceName,
  1467  			DeviceLink: a.DeviceLink,
  1468  			BusAddress: a.BusAddress,
  1469  			ReadOnly:   a.ReadOnly,
  1470  			PlanInfo:   planInfo,
  1471  		}
  1472  	}
  1473  	return result
  1474  }