github.com/mattyw/juju@v0.0.0-20140610034352-732aecd63861/worker/provisioner/provisioner_task.go (about)

     1  // Copyright 2012, 2013 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package provisioner
     5  
     6  import (
     7  	"fmt"
     8  	"time"
     9  
    10  	"github.com/juju/names"
    11  	"github.com/juju/utils"
    12  	"github.com/juju/utils/set"
    13  	"launchpad.net/tomb"
    14  
    15  	"github.com/juju/juju/constraints"
    16  	"github.com/juju/juju/environs"
    17  	"github.com/juju/juju/environs/cloudinit"
    18  	"github.com/juju/juju/environs/network"
    19  	"github.com/juju/juju/environs/tools"
    20  	"github.com/juju/juju/instance"
    21  	"github.com/juju/juju/state/api/params"
    22  	apiprovisioner "github.com/juju/juju/state/api/provisioner"
    23  	apiwatcher "github.com/juju/juju/state/api/watcher"
    24  	"github.com/juju/juju/state/watcher"
    25  	coretools "github.com/juju/juju/tools"
    26  	"github.com/juju/juju/worker"
    27  )
    28  
    29  type ProvisionerTask interface {
    30  	worker.Worker
    31  	Stop() error
    32  	Dying() <-chan struct{}
    33  	Err() error
    34  
    35  	// SetSafeMode sets a flag to indicate whether the provisioner task
    36  	// runs in safe mode or not. In safe mode, any running instances
    37  	// which do no exist in state are allowed to keep running rather than
    38  	// being shut down.
    39  	SetSafeMode(safeMode bool)
    40  }
    41  
    42  type MachineGetter interface {
    43  	Machine(tag string) (*apiprovisioner.Machine, error)
    44  	MachinesWithTransientErrors() ([]*apiprovisioner.Machine, []params.StatusResult, error)
    45  }
    46  
    47  var _ MachineGetter = (*apiprovisioner.State)(nil)
    48  
    49  func NewProvisionerTask(
    50  	machineTag string,
    51  	safeMode bool,
    52  	machineGetter MachineGetter,
    53  	machineWatcher apiwatcher.StringsWatcher,
    54  	retryWatcher apiwatcher.NotifyWatcher,
    55  	broker environs.InstanceBroker,
    56  	auth environs.AuthenticationProvider,
    57  ) ProvisionerTask {
    58  	task := &provisionerTask{
    59  		machineTag:     machineTag,
    60  		machineGetter:  machineGetter,
    61  		machineWatcher: machineWatcher,
    62  		retryWatcher:   retryWatcher,
    63  		broker:         broker,
    64  		auth:           auth,
    65  		safeMode:       safeMode,
    66  		safeModeChan:   make(chan bool, 1),
    67  		machines:       make(map[string]*apiprovisioner.Machine),
    68  	}
    69  	go func() {
    70  		defer task.tomb.Done()
    71  		task.tomb.Kill(task.loop())
    72  	}()
    73  	return task
    74  }
    75  
    76  type provisionerTask struct {
    77  	machineTag     string
    78  	machineGetter  MachineGetter
    79  	machineWatcher apiwatcher.StringsWatcher
    80  	retryWatcher   apiwatcher.NotifyWatcher
    81  	broker         environs.InstanceBroker
    82  	tomb           tomb.Tomb
    83  	auth           environs.AuthenticationProvider
    84  
    85  	safeMode     bool
    86  	safeModeChan chan bool
    87  
    88  	// instance id -> instance
    89  	instances map[instance.Id]instance.Instance
    90  	// machine id -> machine
    91  	machines map[string]*apiprovisioner.Machine
    92  }
    93  
    94  // Kill implements worker.Worker.Kill.
    95  func (task *provisionerTask) Kill() {
    96  	task.tomb.Kill(nil)
    97  }
    98  
    99  // Wait implements worker.Worker.Wait.
   100  func (task *provisionerTask) Wait() error {
   101  	return task.tomb.Wait()
   102  }
   103  
   104  func (task *provisionerTask) Stop() error {
   105  	task.Kill()
   106  	return task.Wait()
   107  }
   108  
   109  func (task *provisionerTask) Dying() <-chan struct{} {
   110  	return task.tomb.Dying()
   111  }
   112  
   113  func (task *provisionerTask) Err() error {
   114  	return task.tomb.Err()
   115  }
   116  
   117  func (task *provisionerTask) loop() error {
   118  	logger.Infof("Starting up provisioner task %s", task.machineTag)
   119  	defer watcher.Stop(task.machineWatcher, &task.tomb)
   120  
   121  	// Don't allow the safe mode to change until we have
   122  	// read at least one set of changes, which will populate
   123  	// the task.machines map. Otherwise we will potentially
   124  	// see all legitimate instances as unknown.
   125  	var safeModeChan chan bool
   126  
   127  	// Not all provisioners have a retry channel.
   128  	var retryChan <-chan struct{}
   129  	if task.retryWatcher != nil {
   130  		retryChan = task.retryWatcher.Changes()
   131  	}
   132  
   133  	// When the watcher is started, it will have the initial changes be all
   134  	// the machines that are relevant. Also, since this is available straight
   135  	// away, we know there will be some changes right off the bat.
   136  	for {
   137  		select {
   138  		case <-task.tomb.Dying():
   139  			logger.Infof("Shutting down provisioner task %s", task.machineTag)
   140  			return tomb.ErrDying
   141  		case ids, ok := <-task.machineWatcher.Changes():
   142  			if !ok {
   143  				return watcher.MustErr(task.machineWatcher)
   144  			}
   145  			if err := task.processMachines(ids); err != nil {
   146  				return fmt.Errorf("failed to process updated machines: %v", err)
   147  			}
   148  			// We've seen a set of changes. Enable safe mode change.
   149  			safeModeChan = task.safeModeChan
   150  		case safeMode := <-safeModeChan:
   151  			if safeMode == task.safeMode {
   152  				break
   153  			}
   154  			logger.Infof("safe mode changed to %v", safeMode)
   155  			task.safeMode = safeMode
   156  			if !safeMode {
   157  				// Safe mode has been disabled, so process current machines
   158  				// so that unknown machines will be immediately dealt with.
   159  				if err := task.processMachines(nil); err != nil {
   160  					return fmt.Errorf("failed to process machines after safe mode disabled: %v", err)
   161  				}
   162  			}
   163  		case <-retryChan:
   164  			if err := task.processMachinesWithTransientErrors(); err != nil {
   165  				return fmt.Errorf("failed to process machines with transient errors: %v", err)
   166  			}
   167  		}
   168  	}
   169  }
   170  
   171  // SetSafeMode implements ProvisionerTask.SetSafeMode().
   172  func (task *provisionerTask) SetSafeMode(safeMode bool) {
   173  	select {
   174  	case task.safeModeChan <- safeMode:
   175  	case <-task.Dying():
   176  	}
   177  }
   178  
   179  func (task *provisionerTask) processMachinesWithTransientErrors() error {
   180  	machines, statusResults, err := task.machineGetter.MachinesWithTransientErrors()
   181  	if err != nil {
   182  		return nil
   183  	}
   184  	logger.Tracef("processMachinesWithTransientErrors(%v)", statusResults)
   185  	var pending []*apiprovisioner.Machine
   186  	for i, status := range statusResults {
   187  		if status.Error != nil {
   188  			logger.Errorf("cannot retry provisioning of machine %q: %v", status.Id, status.Error)
   189  			continue
   190  		}
   191  		machine := machines[i]
   192  		if err := machine.SetStatus(params.StatusPending, "", nil); err != nil {
   193  			logger.Errorf("cannot reset status of machine %q: %v", status.Id, err)
   194  			continue
   195  		}
   196  		task.machines[machine.Tag()] = machine
   197  		pending = append(pending, machine)
   198  	}
   199  	return task.startMachines(pending)
   200  }
   201  
   202  func (task *provisionerTask) processMachines(ids []string) error {
   203  	logger.Tracef("processMachines(%v)", ids)
   204  	// Populate the tasks maps of current instances and machines.
   205  	err := task.populateMachineMaps(ids)
   206  	if err != nil {
   207  		return err
   208  	}
   209  
   210  	// Find machines without an instance id or that are dead
   211  	pending, dead, err := task.pendingOrDead(ids)
   212  	if err != nil {
   213  		return err
   214  	}
   215  
   216  	// Stop all machines that are dead
   217  	stopping := task.instancesForMachines(dead)
   218  
   219  	// Find running instances that have no machines associated
   220  	unknown, err := task.findUnknownInstances(stopping)
   221  	if err != nil {
   222  		return err
   223  	}
   224  	if task.safeMode {
   225  		logger.Infof("running in safe mode, unknown instances not stopped %v", instanceIds(unknown))
   226  		unknown = nil
   227  	}
   228  	if len(stopping) > 0 {
   229  		logger.Infof("stopping known instances %v", stopping)
   230  	}
   231  	if len(unknown) > 0 {
   232  		logger.Infof("stopping unknown instances %v", instanceIds(unknown))
   233  	}
   234  	// It's important that we stop unknown instances before starting
   235  	// pending ones, because if we start an instance and then fail to
   236  	// set its InstanceId on the machine we don't want to start a new
   237  	// instance for the same machine ID.
   238  	if err := task.stopInstances(append(stopping, unknown...)); err != nil {
   239  		return err
   240  	}
   241  
   242  	// Remove any dead machines from state.
   243  	for _, machine := range dead {
   244  		logger.Infof("removing dead machine %q", machine)
   245  		if err := machine.Remove(); err != nil {
   246  			logger.Errorf("failed to remove dead machine %q", machine)
   247  		}
   248  		delete(task.machines, machine.Id())
   249  	}
   250  
   251  	// Start an instance for the pending ones
   252  	return task.startMachines(pending)
   253  }
   254  
   255  func instanceIds(instances []instance.Instance) []string {
   256  	ids := make([]string, 0, len(instances))
   257  	for _, inst := range instances {
   258  		ids = append(ids, string(inst.Id()))
   259  	}
   260  	return ids
   261  }
   262  
   263  func (task *provisionerTask) populateMachineMaps(ids []string) error {
   264  	task.instances = make(map[instance.Id]instance.Instance)
   265  
   266  	instances, err := task.broker.AllInstances()
   267  	if err != nil {
   268  		logger.Errorf("failed to get all instances from broker: %v", err)
   269  		return err
   270  	}
   271  	for _, i := range instances {
   272  		task.instances[i.Id()] = i
   273  	}
   274  
   275  	// Update the machines map with new data for each of the machines in the
   276  	// change list.
   277  	// TODO(thumper): update for API server later to get all machines in one go.
   278  	for _, id := range ids {
   279  		machineTag := names.MachineTag(id)
   280  		machine, err := task.machineGetter.Machine(machineTag)
   281  		switch {
   282  		case params.IsCodeNotFoundOrCodeUnauthorized(err):
   283  			logger.Debugf("machine %q not found in state", id)
   284  			delete(task.machines, id)
   285  		case err == nil:
   286  			task.machines[id] = machine
   287  		default:
   288  			logger.Errorf("failed to get machine: %v", err)
   289  		}
   290  	}
   291  	return nil
   292  }
   293  
   294  // pendingOrDead looks up machines with ids and returns those that do not
   295  // have an instance id assigned yet, and also those that are dead.
   296  func (task *provisionerTask) pendingOrDead(ids []string) (pending, dead []*apiprovisioner.Machine, err error) {
   297  	for _, id := range ids {
   298  		machine, found := task.machines[id]
   299  		if !found {
   300  			logger.Infof("machine %q not found", id)
   301  			continue
   302  		}
   303  		switch machine.Life() {
   304  		case params.Dying:
   305  			if _, err := machine.InstanceId(); err == nil {
   306  				continue
   307  			} else if !params.IsCodeNotProvisioned(err) {
   308  				logger.Errorf("failed to load machine %q instance id: %v", machine, err)
   309  				return nil, nil, err
   310  			}
   311  			logger.Infof("killing dying, unprovisioned machine %q", machine)
   312  			if err := machine.EnsureDead(); err != nil {
   313  				logger.Errorf("failed to ensure machine dead %q: %v", machine, err)
   314  				return nil, nil, err
   315  			}
   316  			fallthrough
   317  		case params.Dead:
   318  			dead = append(dead, machine)
   319  			continue
   320  		}
   321  		if instId, err := machine.InstanceId(); err != nil {
   322  			if !params.IsCodeNotProvisioned(err) {
   323  				logger.Errorf("failed to load machine %q instance id: %v", machine, err)
   324  				continue
   325  			}
   326  			status, _, err := machine.Status()
   327  			if err != nil {
   328  				logger.Infof("cannot get machine %q status: %v", machine, err)
   329  				continue
   330  			}
   331  			if status == params.StatusPending {
   332  				pending = append(pending, machine)
   333  				logger.Infof("found machine %q pending provisioning", machine)
   334  				continue
   335  			}
   336  		} else {
   337  			logger.Infof("machine %v already started as instance %q", machine, instId)
   338  		}
   339  	}
   340  	logger.Tracef("pending machines: %v", pending)
   341  	logger.Tracef("dead machines: %v", dead)
   342  	return
   343  }
   344  
   345  // findUnknownInstances finds instances which are not associated with a machine.
   346  func (task *provisionerTask) findUnknownInstances(stopping []instance.Instance) ([]instance.Instance, error) {
   347  	// Make a copy of the instances we know about.
   348  	instances := make(map[instance.Id]instance.Instance)
   349  	for k, v := range task.instances {
   350  		instances[k] = v
   351  	}
   352  
   353  	for _, m := range task.machines {
   354  		instId, err := m.InstanceId()
   355  		switch {
   356  		case err == nil:
   357  			delete(instances, instId)
   358  		case params.IsCodeNotProvisioned(err):
   359  		case params.IsCodeNotFoundOrCodeUnauthorized(err):
   360  		default:
   361  			return nil, err
   362  		}
   363  	}
   364  	// Now remove all those instances that we are stopping already as we
   365  	// know about those and don't want to include them in the unknown list.
   366  	for _, inst := range stopping {
   367  		delete(instances, inst.Id())
   368  	}
   369  	var unknown []instance.Instance
   370  	for _, inst := range instances {
   371  		unknown = append(unknown, inst)
   372  	}
   373  	return unknown, nil
   374  }
   375  
   376  // instancesForMachines returns a list of instance.Instance that represent
   377  // the list of machines running in the provider. Missing machines are
   378  // omitted from the list.
   379  func (task *provisionerTask) instancesForMachines(machines []*apiprovisioner.Machine) []instance.Instance {
   380  	var instances []instance.Instance
   381  	for _, machine := range machines {
   382  		instId, err := machine.InstanceId()
   383  		if err == nil {
   384  			instance, found := task.instances[instId]
   385  			// If the instance is not found we can't stop it.
   386  			if found {
   387  				instances = append(instances, instance)
   388  			}
   389  		}
   390  	}
   391  	return instances
   392  }
   393  
   394  func (task *provisionerTask) stopInstances(instances []instance.Instance) error {
   395  	// Although calling StopInstance with an empty slice should produce no change in the
   396  	// provider, environs like dummy do not consider this a noop.
   397  	if len(instances) == 0 {
   398  		return nil
   399  	}
   400  	ids := make([]instance.Id, len(instances))
   401  	for i, inst := range instances {
   402  		ids[i] = inst.Id()
   403  	}
   404  	if err := task.broker.StopInstances(ids...); err != nil {
   405  		logger.Errorf("broker failed to stop instances: %v", err)
   406  		return err
   407  	}
   408  	return nil
   409  }
   410  
   411  func (task *provisionerTask) startMachines(machines []*apiprovisioner.Machine) error {
   412  	for _, m := range machines {
   413  		if err := task.startMachine(m); err != nil {
   414  			return fmt.Errorf("cannot start machine %v: %v", m, err)
   415  		}
   416  	}
   417  	return nil
   418  }
   419  
   420  func (task *provisionerTask) setErrorStatus(message string, machine *apiprovisioner.Machine, err error) error {
   421  	logger.Errorf(message, machine, err)
   422  	if err1 := machine.SetStatus(params.StatusError, err.Error(), nil); err1 != nil {
   423  		// Something is wrong with this machine, better report it back.
   424  		logger.Errorf("cannot set error status for machine %q: %v", machine, err1)
   425  		return err1
   426  	}
   427  	return nil
   428  }
   429  
   430  func (task *provisionerTask) prepareNetworkAndInterfaces(networkInfo []network.Info) (
   431  	networks []params.Network, ifaces []params.NetworkInterface) {
   432  	if len(networkInfo) == 0 {
   433  		return nil, nil
   434  	}
   435  	visitedNetworks := set.NewStrings()
   436  	for _, info := range networkInfo {
   437  		networkTag := names.NetworkTag(info.NetworkName)
   438  		if !visitedNetworks.Contains(networkTag) {
   439  			networks = append(networks, params.Network{
   440  				Tag:        networkTag,
   441  				ProviderId: info.ProviderId,
   442  				CIDR:       info.CIDR,
   443  				VLANTag:    info.VLANTag,
   444  			})
   445  			visitedNetworks.Add(networkTag)
   446  		}
   447  		ifaces = append(ifaces, params.NetworkInterface{
   448  			InterfaceName: info.InterfaceName,
   449  			MACAddress:    info.MACAddress,
   450  			NetworkTag:    networkTag,
   451  			IsVirtual:     info.IsVirtual,
   452  		})
   453  	}
   454  	return networks, ifaces
   455  }
   456  
   457  func (task *provisionerTask) startMachine(machine *apiprovisioner.Machine) error {
   458  	provisioningInfo, err := task.provisioningInfo(machine)
   459  	if err != nil {
   460  		return err
   461  	}
   462  	possibleTools, err := task.possibleTools(provisioningInfo.Series, provisioningInfo.Constraints)
   463  	if err != nil {
   464  		return task.setErrorStatus("cannot find tools for machine %q: %v", machine, err)
   465  	}
   466  	inst, metadata, networkInfo, err := task.broker.StartInstance(environs.StartInstanceParams{
   467  		Constraints:       provisioningInfo.Constraints,
   468  		Tools:             possibleTools,
   469  		MachineConfig:     provisioningInfo.MachineConfig,
   470  		Placement:         provisioningInfo.Placement,
   471  		DistributionGroup: machine.DistributionGroup,
   472  	})
   473  	if err != nil {
   474  		// Set the state to error, so the machine will be skipped next
   475  		// time until the error is resolved, but don't return an
   476  		// error; just keep going with the other machines.
   477  		return task.setErrorStatus("cannot start instance for machine %q: %v", machine, err)
   478  	}
   479  	nonce := provisioningInfo.MachineConfig.MachineNonce
   480  	networks, ifaces := task.prepareNetworkAndInterfaces(networkInfo)
   481  
   482  	err = machine.SetInstanceInfo(inst.Id(), nonce, metadata, networks, ifaces)
   483  	if err != nil && params.IsCodeNotImplemented(err) {
   484  		return fmt.Errorf("cannot provision instance %v for machine %q with networks: not implemented", inst.Id(), machine)
   485  	} else if err == nil {
   486  		logger.Infof("started machine %s as instance %s with hardware %q, networks %v, interfaces %v", machine, inst.Id(), metadata, networks, ifaces)
   487  		return nil
   488  	}
   489  	// We need to stop the instance right away here, set error status and go on.
   490  	task.setErrorStatus("cannot register instance for machine %v: %v", machine, err)
   491  	if err := task.broker.StopInstances(inst.Id()); err != nil {
   492  		// We cannot even stop the instance, log the error and quit.
   493  		logger.Errorf("cannot stop instance %q for machine %v: %v", inst.Id(), machine, err)
   494  		return err
   495  	}
   496  	return nil
   497  }
   498  
   499  func (task *provisionerTask) possibleTools(series string, cons constraints.Value) (coretools.List, error) {
   500  	if env, ok := task.broker.(environs.Environ); ok {
   501  		agentVersion, ok := env.Config().AgentVersion()
   502  		if !ok {
   503  			return nil, fmt.Errorf("no agent version set in environment configuration")
   504  		}
   505  		return tools.FindInstanceTools(env, agentVersion, series, cons.Arch)
   506  	}
   507  	if hasTools, ok := task.broker.(coretools.HasTools); ok {
   508  		return hasTools.Tools(series), nil
   509  	}
   510  	panic(fmt.Errorf("broker of type %T does not provide any tools", task.broker))
   511  }
   512  
   513  type provisioningInfo struct {
   514  	Constraints   constraints.Value
   515  	Series        string
   516  	Placement     string
   517  	MachineConfig *cloudinit.MachineConfig
   518  }
   519  
   520  func (task *provisionerTask) provisioningInfo(machine *apiprovisioner.Machine) (*provisioningInfo, error) {
   521  	stateInfo, apiInfo, err := task.auth.SetupAuthentication(machine)
   522  	if err != nil {
   523  		logger.Errorf("failed to setup authentication: %v", err)
   524  		return nil, err
   525  	}
   526  	// Generated a nonce for the new instance, with the format: "machine-#:UUID".
   527  	// The first part is a badge, specifying the tag of the machine the provisioner
   528  	// is running on, while the second part is a random UUID.
   529  	uuid, err := utils.NewUUID()
   530  	if err != nil {
   531  		return nil, err
   532  	}
   533  	// ProvisioningInfo is new in 1.20; wait for the API server to be upgraded
   534  	// so we don't spew errors on upgrade.
   535  	var pInfo *params.ProvisioningInfo
   536  	for {
   537  		if pInfo, err = machine.ProvisioningInfo(); err == nil {
   538  			break
   539  		}
   540  		if params.IsCodeNotImplemented(err) {
   541  			logger.Infof("waiting for state server to be upgraded")
   542  			select {
   543  			case <-task.tomb.Dying():
   544  				return nil, tomb.ErrDying
   545  			case <-time.After(15 * time.Second):
   546  				continue
   547  			}
   548  		}
   549  		return nil, err
   550  	}
   551  	nonce := fmt.Sprintf("%s:%s", task.machineTag, uuid.String())
   552  	machineConfig := environs.NewMachineConfig(machine.Id(), nonce, pInfo.Networks, stateInfo, apiInfo)
   553  	return &provisioningInfo{
   554  		Constraints:   pInfo.Constraints,
   555  		Series:        pInfo.Series,
   556  		Placement:     pInfo.Placement,
   557  		MachineConfig: machineConfig,
   558  	}, nil
   559  }