github.com/cloudbase/juju-core@v0.0.0-20140504232958-a7271ac7912f/worker/provisioner/provisioner_task.go (about)

     1  // Copyright 2012, 2013 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package provisioner
     5  
     6  import (
     7  	"fmt"
     8  
     9  	"launchpad.net/tomb"
    10  
    11  	"launchpad.net/juju-core/constraints"
    12  	"launchpad.net/juju-core/environs"
    13  	"launchpad.net/juju-core/environs/cloudinit"
    14  	"launchpad.net/juju-core/environs/tools"
    15  	"launchpad.net/juju-core/instance"
    16  	"launchpad.net/juju-core/names"
    17  	"launchpad.net/juju-core/state/api/params"
    18  	apiprovisioner "launchpad.net/juju-core/state/api/provisioner"
    19  	"launchpad.net/juju-core/state/watcher"
    20  	coretools "launchpad.net/juju-core/tools"
    21  	"launchpad.net/juju-core/utils"
    22  	"launchpad.net/juju-core/worker"
    23  )
    24  
// ProvisionerTask is the handle returned by NewProvisionerTask. It embeds
// worker.Worker and adds explicit stop/inspection methods plus a switch for
// safe mode.
type ProvisionerTask interface {
	worker.Worker
	// Stop asks the task to shut down and returns once it has finished.
	Stop() error
	// Dying returns a channel that the task's tomb exposes to signal that
	// shutdown has been requested.
	Dying() <-chan struct{}
	// Err returns the error currently recorded by the task's tomb.
	Err() error

	// SetSafeMode sets a flag to indicate whether the provisioner task
	// runs in safe mode or not. In safe mode, any running instances
	// which do not exist in state are allowed to keep running rather than
	// being shut down.
	SetSafeMode(safeMode bool)
}
    37  
// Watcher is the change source consumed by the provisioner task. Besides
// being stoppable and able to report an error, it delivers batches of ids
// on the Changes channel; the task treats each batch as machine ids to
// (re)process and treats a closed channel as a watcher failure.
type Watcher interface {
	watcher.Errer
	watcher.Stopper
	Changes() <-chan []string
}
    43  
// MachineGetter looks up a single machine by its tag. The provisioner task
// uses it to refresh its view of each machine named in a change batch.
type MachineGetter interface {
	Machine(tag string) (*apiprovisioner.Machine, error)
}
    47  
    48  func NewProvisionerTask(
    49  	machineTag string,
    50  	safeMode bool,
    51  	machineGetter MachineGetter,
    52  	watcher Watcher,
    53  	broker environs.InstanceBroker,
    54  	auth environs.AuthenticationProvider,
    55  ) ProvisionerTask {
    56  	task := &provisionerTask{
    57  		machineTag:     machineTag,
    58  		machineGetter:  machineGetter,
    59  		machineWatcher: watcher,
    60  		broker:         broker,
    61  		auth:           auth,
    62  		safeMode:       safeMode,
    63  		safeModeChan:   make(chan bool, 1),
    64  		machines:       make(map[string]*apiprovisioner.Machine),
    65  	}
    66  	go func() {
    67  		defer task.tomb.Done()
    68  		task.tomb.Kill(task.loop())
    69  	}()
    70  	return task
    71  }
    72  
// provisionerTask is the concrete ProvisionerTask. Its maps and safeMode
// flag are only read and written by the loop goroutine started in
// NewProvisionerTask (and the helpers that loop calls).
type provisionerTask struct {
	// machineTag is the tag of the machine this task runs on; it is used
	// in log messages and as the prefix of generated machine nonces.
	machineTag string
	// machineGetter fetches fresh machine data for changed ids.
	machineGetter MachineGetter
	// machineWatcher delivers the ids of machines that need processing.
	machineWatcher Watcher
	// broker lists, starts and stops provider instances.
	broker environs.InstanceBroker
	// tomb tracks the lifecycle of the loop goroutine.
	tomb tomb.Tomb
	// auth sets up authentication for machines about to be started.
	auth environs.AuthenticationProvider

	// safeMode, when true, stops the task from shutting down instances
	// that have no corresponding machine.
	safeMode bool
	// safeModeChan carries safe-mode changes from SetSafeMode to the loop.
	safeModeChan chan bool

	// instance id -> instance
	// Rebuilt from the broker on every populateMachineMaps call.
	instances map[instance.Id]instance.Instance
	// machine id -> machine
	// Updated incrementally as changes arrive; dead and missing machines
	// are removed.
	machines map[string]*apiprovisioner.Machine
}
    89  
// Kill implements worker.Worker.Kill.
// It asks the task's tomb to die with a nil reason and does not wait for
// the loop to exit.
func (task *provisionerTask) Kill() {
	task.tomb.Kill(nil)
}
    94  
// Wait implements worker.Worker.Wait.
// It returns whatever the task's tomb reports from its own Wait.
func (task *provisionerTask) Wait() error {
	return task.tomb.Wait()
}
    99  
   100  func (task *provisionerTask) Stop() error {
   101  	task.Kill()
   102  	return task.Wait()
   103  }
   104  
// Dying implements ProvisionerTask.Dying by exposing the tomb's dying
// channel.
func (task *provisionerTask) Dying() <-chan struct{} {
	return task.tomb.Dying()
}
   108  
// Err implements ProvisionerTask.Err by returning the tomb's current error.
func (task *provisionerTask) Err() error {
	return task.tomb.Err()
}
   112  
   113  func (task *provisionerTask) loop() error {
   114  	logger.Infof("Starting up provisioner task %s", task.machineTag)
   115  	defer watcher.Stop(task.machineWatcher, &task.tomb)
   116  
   117  	// Don't allow the safe mode to change until we have
   118  	// read at least one set of changes, which will populate
   119  	// the task.machines map. Otherwise we will potentially
   120  	// see all legitimate instances as unknown.
   121  	var safeModeChan chan bool
   122  
   123  	// When the watcher is started, it will have the initial changes be all
   124  	// the machines that are relevant. Also, since this is available straight
   125  	// away, we know there will be some changes right off the bat.
   126  	for {
   127  		select {
   128  		case <-task.tomb.Dying():
   129  			logger.Infof("Shutting down provisioner task %s", task.machineTag)
   130  			return tomb.ErrDying
   131  		case ids, ok := <-task.machineWatcher.Changes():
   132  			if !ok {
   133  				return watcher.MustErr(task.machineWatcher)
   134  			}
   135  			// TODO(dfc; lp:1042717) fire process machines periodically to shut down unknown
   136  			// instances.
   137  			if err := task.processMachines(ids); err != nil {
   138  				return fmt.Errorf("failed to process updated machines: %v", err)
   139  			}
   140  			// We've seen a set of changes. Enable safe mode change.
   141  			safeModeChan = task.safeModeChan
   142  		case safeMode := <-safeModeChan:
   143  			if safeMode == task.safeMode {
   144  				break
   145  			}
   146  			logger.Infof("safe mode changed to %v", safeMode)
   147  			task.safeMode = safeMode
   148  			if !safeMode {
   149  				// Safe mode has been disabled, so process current machines
   150  				// so that unknown machines will be immediately dealt with.
   151  				if err := task.processMachines(nil); err != nil {
   152  					return fmt.Errorf("failed to process machines after safe mode disabled: %v", err)
   153  				}
   154  			}
   155  		}
   156  	}
   157  }
   158  
   159  // SetSafeMode implements ProvisionerTask.SetSafeMode().
   160  func (task *provisionerTask) SetSafeMode(safeMode bool) {
   161  	select {
   162  	case task.safeModeChan <- safeMode:
   163  	case <-task.Dying():
   164  	}
   165  }
   166  
   167  func (task *provisionerTask) processMachines(ids []string) error {
   168  	logger.Tracef("processMachines(%v)", ids)
   169  	// Populate the tasks maps of current instances and machines.
   170  	err := task.populateMachineMaps(ids)
   171  	if err != nil {
   172  		return err
   173  	}
   174  
   175  	// Find machines without an instance id or that are dead
   176  	pending, dead, err := task.pendingOrDead(ids)
   177  	if err != nil {
   178  		return err
   179  	}
   180  
   181  	// Stop all machines that are dead
   182  	stopping := task.instancesForMachines(dead)
   183  
   184  	// Find running instances that have no machines associated
   185  	unknown, err := task.findUnknownInstances(stopping)
   186  	if err != nil {
   187  		return err
   188  	}
   189  	if task.safeMode {
   190  		logger.Infof("running in safe mode, unknown instances not stopped %v", instanceIds(unknown))
   191  		unknown = nil
   192  	}
   193  	if len(stopping) > 0 {
   194  		logger.Infof("stopping known instances %v", stopping)
   195  	}
   196  	if len(unknown) > 0 {
   197  		logger.Infof("stopping unknown instances %v", instanceIds(unknown))
   198  	}
   199  	// It's important that we stop unknown instances before starting
   200  	// pending ones, because if we start an instance and then fail to
   201  	// set its InstanceId on the machine we don't want to start a new
   202  	// instance for the same machine ID.
   203  	if err := task.stopInstances(append(stopping, unknown...)); err != nil {
   204  		return err
   205  	}
   206  
   207  	// Remove any dead machines from state.
   208  	for _, machine := range dead {
   209  		logger.Infof("removing dead machine %q", machine)
   210  		if err := machine.Remove(); err != nil {
   211  			logger.Errorf("failed to remove dead machine %q", machine)
   212  		}
   213  		delete(task.machines, machine.Id())
   214  	}
   215  
   216  	// Start an instance for the pending ones
   217  	return task.startMachines(pending)
   218  }
   219  
   220  func instanceIds(instances []instance.Instance) []string {
   221  	ids := make([]string, 0, len(instances))
   222  	for _, inst := range instances {
   223  		ids = append(ids, string(inst.Id()))
   224  	}
   225  	return ids
   226  }
   227  
   228  func (task *provisionerTask) populateMachineMaps(ids []string) error {
   229  	task.instances = make(map[instance.Id]instance.Instance)
   230  
   231  	instances, err := task.broker.AllInstances()
   232  	if err != nil {
   233  		logger.Errorf("failed to get all instances from broker: %v", err)
   234  		return err
   235  	}
   236  	for _, i := range instances {
   237  		task.instances[i.Id()] = i
   238  	}
   239  
   240  	// Update the machines map with new data for each of the machines in the
   241  	// change list.
   242  	// TODO(thumper): update for API server later to get all machines in one go.
   243  	for _, id := range ids {
   244  		machineTag := names.MachineTag(id)
   245  		machine, err := task.machineGetter.Machine(machineTag)
   246  		switch {
   247  		case params.IsCodeNotFoundOrCodeUnauthorized(err):
   248  			logger.Debugf("machine %q not found in state", id)
   249  			delete(task.machines, id)
   250  		case err == nil:
   251  			task.machines[id] = machine
   252  		default:
   253  			logger.Errorf("failed to get machine: %v", err)
   254  		}
   255  	}
   256  	return nil
   257  }
   258  
   259  // pendingOrDead looks up machines with ids and returns those that do not
   260  // have an instance id assigned yet, and also those that are dead.
   261  func (task *provisionerTask) pendingOrDead(ids []string) (pending, dead []*apiprovisioner.Machine, err error) {
   262  	for _, id := range ids {
   263  		machine, found := task.machines[id]
   264  		if !found {
   265  			logger.Infof("machine %q not found", id)
   266  			continue
   267  		}
   268  		switch machine.Life() {
   269  		case params.Dying:
   270  			if _, err := machine.InstanceId(); err == nil {
   271  				continue
   272  			} else if !params.IsCodeNotProvisioned(err) {
   273  				logger.Errorf("failed to load machine %q instance id: %v", machine, err)
   274  				return nil, nil, err
   275  			}
   276  			logger.Infof("killing dying, unprovisioned machine %q", machine)
   277  			if err := machine.EnsureDead(); err != nil {
   278  				logger.Errorf("failed to ensure machine dead %q: %v", machine, err)
   279  				return nil, nil, err
   280  			}
   281  			fallthrough
   282  		case params.Dead:
   283  			dead = append(dead, machine)
   284  			continue
   285  		}
   286  		if instId, err := machine.InstanceId(); err != nil {
   287  			if !params.IsCodeNotProvisioned(err) {
   288  				logger.Errorf("failed to load machine %q instance id: %v", machine, err)
   289  				continue
   290  			}
   291  			status, _, err := machine.Status()
   292  			if err != nil {
   293  				logger.Infof("cannot get machine %q status: %v", machine, err)
   294  				continue
   295  			}
   296  			if status == params.StatusPending {
   297  				pending = append(pending, machine)
   298  				logger.Infof("found machine %q pending provisioning", machine)
   299  				continue
   300  			}
   301  		} else {
   302  			logger.Infof("machine %v already started as instance %q", machine, instId)
   303  		}
   304  	}
   305  	logger.Tracef("pending machines: %v", pending)
   306  	logger.Tracef("dead machines: %v", dead)
   307  	return
   308  }
   309  
   310  // findUnknownInstances finds instances which are not associated with a machine.
   311  func (task *provisionerTask) findUnknownInstances(stopping []instance.Instance) ([]instance.Instance, error) {
   312  	// Make a copy of the instances we know about.
   313  	instances := make(map[instance.Id]instance.Instance)
   314  	for k, v := range task.instances {
   315  		instances[k] = v
   316  	}
   317  
   318  	for _, m := range task.machines {
   319  		instId, err := m.InstanceId()
   320  		switch {
   321  		case err == nil:
   322  			delete(instances, instId)
   323  		case params.IsCodeNotProvisioned(err):
   324  		case params.IsCodeNotFoundOrCodeUnauthorized(err):
   325  		default:
   326  			return nil, err
   327  		}
   328  	}
   329  	// Now remove all those instances that we are stopping already as we
   330  	// know about those and don't want to include them in the unknown list.
   331  	for _, inst := range stopping {
   332  		delete(instances, inst.Id())
   333  	}
   334  	var unknown []instance.Instance
   335  	for _, inst := range instances {
   336  		unknown = append(unknown, inst)
   337  	}
   338  	return unknown, nil
   339  }
   340  
   341  // instancesForMachines returns a list of instance.Instance that represent
   342  // the list of machines running in the provider. Missing machines are
   343  // omitted from the list.
   344  func (task *provisionerTask) instancesForMachines(machines []*apiprovisioner.Machine) []instance.Instance {
   345  	var instances []instance.Instance
   346  	for _, machine := range machines {
   347  		instId, err := machine.InstanceId()
   348  		if err == nil {
   349  			instance, found := task.instances[instId]
   350  			// If the instance is not found we can't stop it.
   351  			if found {
   352  				instances = append(instances, instance)
   353  			}
   354  		}
   355  	}
   356  	return instances
   357  }
   358  
   359  func (task *provisionerTask) stopInstances(instances []instance.Instance) error {
   360  	// Although calling StopInstance with an empty slice should produce no change in the
   361  	// provider, environs like dummy do not consider this a noop.
   362  	if len(instances) == 0 {
   363  		return nil
   364  	}
   365  	if err := task.broker.StopInstances(instances); err != nil {
   366  		logger.Errorf("broker failed to stop instances: %v", err)
   367  		return err
   368  	}
   369  	return nil
   370  }
   371  
   372  func (task *provisionerTask) startMachines(machines []*apiprovisioner.Machine) error {
   373  	for _, m := range machines {
   374  		if err := task.startMachine(m); err != nil {
   375  			return fmt.Errorf("cannot start machine %v: %v", m, err)
   376  		}
   377  	}
   378  	return nil
   379  }
   380  
   381  func (task *provisionerTask) startMachine(machine *apiprovisioner.Machine) error {
   382  	cons, err := machine.Constraints()
   383  	if err != nil {
   384  		return err
   385  	}
   386  	series, err := machine.Series()
   387  	if err != nil {
   388  		return err
   389  	}
   390  	possibleTools, err := task.possibleTools(series, cons)
   391  	if err != nil {
   392  		return err
   393  	}
   394  	machineConfig, err := task.machineConfig(machine)
   395  	if err != nil {
   396  		return err
   397  	}
   398  	inst, metadata, err := task.broker.StartInstance(cons, possibleTools, machineConfig)
   399  	if err != nil {
   400  		// Set the state to error, so the machine will be skipped next
   401  		// time until the error is resolved, but don't return an
   402  		// error; just keep going with the other machines.
   403  		logger.Errorf("cannot start instance for machine %q: %v", machine, err)
   404  		if err1 := machine.SetStatus(params.StatusError, err.Error()); err1 != nil {
   405  			// Something is wrong with this machine, better report it back.
   406  			logger.Errorf("cannot set error status for machine %q: %v", machine, err1)
   407  			return err1
   408  		}
   409  		return nil
   410  	}
   411  	nonce := machineConfig.MachineNonce
   412  	if err := machine.SetProvisioned(inst.Id(), nonce, metadata); err != nil {
   413  		logger.Errorf("cannot register instance for machine %v: %v", machine, err)
   414  		// The machine is started, but we can't record the mapping in
   415  		// state. It'll keep running while we fail out and restart,
   416  		// but will then be detected by findUnknownInstances and
   417  		// killed again.
   418  		//
   419  		// TODO(dimitern) Stop the instance right away here.
   420  		//
   421  		// Multiple instantiations of a given machine (with the same
   422  		// machine ID) cannot coexist, because findUnknownInstances is
   423  		// called before startMachines. However, if the first machine
   424  		// had started to do work before being replaced, we may
   425  		// encounter surprising problems.
   426  		return err
   427  	}
   428  	logger.Infof("started machine %s as instance %s with hardware %q", machine, inst.Id(), metadata)
   429  	return nil
   430  }
   431  
   432  func (task *provisionerTask) possibleTools(series string, cons constraints.Value) (coretools.List, error) {
   433  	if env, ok := task.broker.(environs.Environ); ok {
   434  		agentVersion, ok := env.Config().AgentVersion()
   435  		if !ok {
   436  			return nil, fmt.Errorf("no agent version set in environment configuration")
   437  		}
   438  		return tools.FindInstanceTools(env, agentVersion, series, cons.Arch)
   439  	}
   440  	if hasTools, ok := task.broker.(coretools.HasTools); ok {
   441  		return hasTools.Tools(), nil
   442  	}
   443  	panic(fmt.Errorf("broker of type %T does not provide any tools", task.broker))
   444  }
   445  
   446  func (task *provisionerTask) machineConfig(machine *apiprovisioner.Machine) (*cloudinit.MachineConfig, error) {
   447  	stateInfo, apiInfo, err := task.auth.SetupAuthentication(machine)
   448  	if err != nil {
   449  		logger.Errorf("failed to setup authentication: %v", err)
   450  		return nil, err
   451  	}
   452  	// Generated a nonce for the new instance, with the format: "machine-#:UUID".
   453  	// The first part is a badge, specifying the tag of the machine the provisioner
   454  	// is running on, while the second part is a random UUID.
   455  	uuid, err := utils.NewUUID()
   456  	if err != nil {
   457  		return nil, err
   458  	}
   459  	nonce := fmt.Sprintf("%s:%s", task.machineTag, uuid.String())
   460  	serie, err := machine.Series()
   461  	if err != nil {
   462  		return nil, err
   463  	}
   464  	machineConfig := environs.NewMachineConfig(machine.Id(), nonce, serie, stateInfo, apiInfo)
   465  	return machineConfig, nil
   466  }