launchpad.net/~rogpeppe/juju-core/500-errgo-fix@v0.0.0-20140213181702-000000002356/worker/provisioner/provisioner_task.go (about)

     1  // Copyright 2012, 2013 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package provisioner
     5  
     6  import (
     7  	"fmt"
     8  
     9  	"launchpad.net/errgo/errors"
    10  	"launchpad.net/tomb"
    11  
    12  	"launchpad.net/juju-core/constraints"
    13  	"launchpad.net/juju-core/environs"
    14  	"launchpad.net/juju-core/environs/cloudinit"
    15  	"launchpad.net/juju-core/environs/tools"
    16  	"launchpad.net/juju-core/instance"
    17  	"launchpad.net/juju-core/names"
    18  	"launchpad.net/juju-core/state/api/params"
    19  	apiprovisioner "launchpad.net/juju-core/state/api/provisioner"
    20  	"launchpad.net/juju-core/state/watcher"
    21  	coretools "launchpad.net/juju-core/tools"
    22  	"launchpad.net/juju-core/utils"
    23  	"launchpad.net/juju-core/worker"
    24  )
    25  
// ProvisionerTask is the interface returned by NewProvisionerTask. It
// embeds worker.Worker and adds methods for stopping the task, observing
// its termination and toggling safe mode.
type ProvisionerTask interface {
	worker.Worker

	// Stop shuts the task down and returns the error it finished with.
	Stop() error

	// Dying returns a channel that signals that the task is shutting down.
	Dying() <-chan struct{}

	// Err returns the task's current termination error.
	Err() error

	// SetSafeMode sets a flag to indicate whether the provisioner task
	// runs in safe mode or not. In safe mode, any running instances
	// which do not exist in state are allowed to keep running rather than
	// being shut down.
	SetSafeMode(safeMode bool)
}
    38  
// Watcher is the change source consumed by the provisioner task. Besides
// the embedded watcher.Errer and watcher.Stopper behaviour it exposes a
// channel of changes.
type Watcher interface {
	watcher.Errer
	watcher.Stopper

	// Changes returns the channel on which batches of ids are delivered.
	// The task's loop treats each element as a machine id and treats a
	// closed channel as a watcher failure.
	Changes() <-chan []string
}
    44  
// MachineGetter is the source the provisioner task uses to look up
// machines by tag.
type MachineGetter interface {
	// Machine returns the machine identified by the given machine tag.
	Machine(tag string) (*apiprovisioner.Machine, error)
}
    48  
    49  func NewProvisionerTask(
    50  	machineTag string,
    51  	safeMode bool,
    52  	machineGetter MachineGetter,
    53  	watcher Watcher,
    54  	broker environs.InstanceBroker,
    55  	auth environs.AuthenticationProvider,
    56  ) ProvisionerTask {
    57  	task := &provisionerTask{
    58  		machineTag:     machineTag,
    59  		machineGetter:  machineGetter,
    60  		machineWatcher: watcher,
    61  		broker:         broker,
    62  		auth:           auth,
    63  		safeMode:       safeMode,
    64  		safeModeChan:   make(chan bool, 1),
    65  		machines:       make(map[string]*apiprovisioner.Machine),
    66  	}
    67  	go func() {
    68  		defer task.tomb.Done()
    69  		task.tomb.Kill(task.loop())
    70  	}()
    71  	return task
    72  }
    73  
// provisionerTask is the ProvisionerTask implementation created by
// NewProvisionerTask. Its maps are only touched from the task's own loop
// goroutine in the code of this file.
type provisionerTask struct {
	// machineTag is the tag of the machine the provisioner runs on; it is
	// logged and used as the prefix of generated machine nonces.
	machineTag string
	// machineGetter resolves machine tags to machines.
	machineGetter MachineGetter
	// machineWatcher delivers the ids of machines that have changed.
	machineWatcher Watcher
	// broker lists, starts and stops instances.
	broker environs.InstanceBroker
	// tomb tracks the lifetime of the loop goroutine.
	tomb tomb.Tomb
	// auth sets up authentication for machines that are about to start.
	auth environs.AuthenticationProvider

	// safeMode, when true, prevents unknown instances from being stopped.
	safeMode bool
	// safeModeChan carries SetSafeMode requests to the loop goroutine.
	safeModeChan chan bool

	// instance id -> instance
	// Rebuilt from the broker on every call to populateMachineMaps.
	instances map[instance.Id]instance.Instance
	// machine id -> machine
	// Updated incrementally for the ids passed to populateMachineMaps.
	machines map[string]*apiprovisioner.Machine
}
    90  
// Kill implements worker.Worker.Kill.
// It asks the task's tomb to die with a nil reason and does not wait for
// the loop to finish.
func (task *provisionerTask) Kill() {
	task.tomb.Kill(nil)
}
    95  
// Wait implements worker.Worker.Wait.
// It returns whatever the task's tomb reports from its own Wait.
func (task *provisionerTask) Wait() error {
	return task.tomb.Wait()
}
   100  
   101  func (task *provisionerTask) Stop() error {
   102  	task.Kill()
   103  	return task.Wait()
   104  }
   105  
// Dying implements ProvisionerTask.Dying by returning the Dying channel
// of the task's tomb.
func (task *provisionerTask) Dying() <-chan struct{} {
	return task.tomb.Dying()
}
   109  
// Err implements ProvisionerTask.Err by returning the error currently
// held by the task's tomb.
func (task *provisionerTask) Err() error {
	return task.tomb.Err()
}
   113  
   114  func (task *provisionerTask) loop() error {
   115  	logger.Infof("Starting up provisioner task %s", task.machineTag)
   116  	defer watcher.Stop(task.machineWatcher, &task.tomb)
   117  
   118  	// Don't allow the safe mode to change until we have
   119  	// read at least one set of changes, which will populate
   120  	// the task.machines map. Otherwise we will potentially
   121  	// see all legitimate instances as unknown.
   122  	var safeModeChan chan bool
   123  
   124  	// When the watcher is started, it will have the initial changes be all
   125  	// the machines that are relevant. Also, since this is available straight
   126  	// away, we know there will be some changes right off the bat.
   127  	for {
   128  		select {
   129  		case <-task.tomb.Dying():
   130  			logger.Infof("Shutting down provisioner task %s", task.machineTag)
   131  			return tomb.ErrDying
   132  		case ids, ok := <-task.machineWatcher.Changes():
   133  			if !ok {
   134  				return watcher.MustErr(task.machineWatcher)
   135  			}
   136  			// TODO(dfc; lp:1042717) fire process machines periodically to shut down unknown
   137  			// instances.
   138  			if err := task.processMachines(ids); err != nil {
   139  				return errors.Notef(err, "failed to process updated machines")
   140  			}
   141  			// We've seen a set of changes. Enable safe mode change.
   142  			safeModeChan = task.safeModeChan
   143  		case safeMode := <-safeModeChan:
   144  			if safeMode == task.safeMode {
   145  				break
   146  			}
   147  			logger.Infof("safe mode changed to %v", safeMode)
   148  			task.safeMode = safeMode
   149  			if !safeMode {
   150  				// Safe mode has been disabled, so process current machines
   151  				// so that unknown machines will be immediately dealt with.
   152  				if err := task.processMachines(nil); err != nil {
   153  					return errors.Notef(err, "failed to process machines after safe mode disabled")
   154  				}
   155  			}
   156  		}
   157  	}
   158  }
   159  
   160  // SetSafeMode implements ProvisionerTask.SetSafeMode().
   161  func (task *provisionerTask) SetSafeMode(safeMode bool) {
   162  	select {
   163  	case task.safeModeChan <- safeMode:
   164  	case <-task.Dying():
   165  	}
   166  }
   167  
   168  func (task *provisionerTask) processMachines(ids []string) error {
   169  	logger.Tracef("processMachines(%v)", ids)
   170  	// Populate the tasks maps of current instances and machines.
   171  	err := task.populateMachineMaps(ids)
   172  	if err != nil {
   173  		return mask(err)
   174  	}
   175  
   176  	// Find machines without an instance id or that are dead
   177  	pending, dead, err := task.pendingOrDead(ids)
   178  	if err != nil {
   179  		return mask(err)
   180  	}
   181  
   182  	// Stop all machines that are dead
   183  	stopping := task.instancesForMachines(dead)
   184  
   185  	// Find running instances that have no machines associated
   186  	unknown, err := task.findUnknownInstances(stopping)
   187  	if err != nil {
   188  		return mask(err)
   189  	}
   190  	if task.safeMode {
   191  		logger.Infof("running in safe mode, unknown instances not stopped %v", instanceIds(unknown))
   192  		unknown = nil
   193  	}
   194  	if len(stopping) > 0 {
   195  		logger.Infof("stopping known instances %v", stopping)
   196  	}
   197  	if len(unknown) > 0 {
   198  		logger.Infof("stopping unknown instances %v", instanceIds(unknown))
   199  	}
   200  	// It's important that we stop unknown instances before starting
   201  	// pending ones, because if we start an instance and then fail to
   202  	// set its InstanceId on the machine we don't want to start a new
   203  	// instance for the same machine ID.
   204  	if err := task.stopInstances(append(stopping, unknown...)); err != nil {
   205  		return mask(err)
   206  	}
   207  
   208  	// Remove any dead machines from state.
   209  	for _, machine := range dead {
   210  		logger.Infof("removing dead machine %q", machine)
   211  		if err := machine.Remove(); err != nil {
   212  			logger.Errorf("failed to remove dead machine %q", machine)
   213  		}
   214  		delete(task.machines, machine.Id())
   215  	}
   216  
   217  	// Start an instance for the pending ones
   218  	return task.startMachines(pending)
   219  }
   220  
   221  func instanceIds(instances []instance.Instance) []string {
   222  	ids := make([]string, 0, len(instances))
   223  	for _, inst := range instances {
   224  		ids = append(ids, string(inst.Id()))
   225  	}
   226  	return ids
   227  }
   228  
   229  func (task *provisionerTask) populateMachineMaps(ids []string) error {
   230  	task.instances = make(map[instance.Id]instance.Instance)
   231  
   232  	instances, err := task.broker.AllInstances()
   233  	if err != nil {
   234  		logger.Errorf("failed to get all instances from broker: %v", err)
   235  		return err
   236  	}
   237  	for _, i := range instances {
   238  		task.instances[i.Id()] = i
   239  	}
   240  
   241  	// Update the machines map with new data for each of the machines in the
   242  	// change list.
   243  	// TODO(thumper): update for API server later to get all machines in one go.
   244  	for _, id := range ids {
   245  		machineTag := names.MachineTag(id)
   246  		machine, err := task.machineGetter.Machine(machineTag)
   247  		switch {
   248  		case params.IsCodeNotFoundOrCodeUnauthorized(err):
   249  			logger.Debugf("machine %q not found in state", id)
   250  			delete(task.machines, id)
   251  		case err == nil:
   252  			task.machines[id] = machine
   253  		default:
   254  			logger.Errorf("failed to get machine: %v", err)
   255  		}
   256  	}
   257  	return nil
   258  }
   259  
   260  // pendingOrDead looks up machines with ids and returns those that do not
   261  // have an instance id assigned yet, and also those that are dead.
   262  func (task *provisionerTask) pendingOrDead(ids []string) (pending, dead []*apiprovisioner.Machine, err error) {
   263  	for _, id := range ids {
   264  		machine, found := task.machines[id]
   265  		if !found {
   266  			logger.Infof("machine %q not found", id)
   267  			continue
   268  		}
   269  		switch machine.Life() {
   270  		case params.Dying:
   271  			if _, err := machine.InstanceId(); err == nil {
   272  				continue
   273  			} else if !params.IsCodeNotProvisioned(err) {
   274  				logger.Errorf("failed to load machine %q instance id: %v", machine, err)
   275  				return nil, nil, err
   276  			}
   277  			logger.Infof("killing dying, unprovisioned machine %q", machine)
   278  			if err := machine.EnsureDead(); err != nil {
   279  				logger.Errorf("failed to ensure machine dead %q: %v", machine, err)
   280  				return nil, nil, err
   281  			}
   282  			fallthrough
   283  		case params.Dead:
   284  			dead = append(dead, machine)
   285  			continue
   286  		}
   287  		if instId, err := machine.InstanceId(); err != nil {
   288  			if !params.IsCodeNotProvisioned(err) {
   289  				logger.Errorf("failed to load machine %q instance id: %v", machine, err)
   290  				continue
   291  			}
   292  			status, _, err := machine.Status()
   293  			if err != nil {
   294  				logger.Infof("cannot get machine %q status: %v", machine, err)
   295  				continue
   296  			}
   297  			if status == params.StatusPending {
   298  				pending = append(pending, machine)
   299  				logger.Infof("found machine %q pending provisioning", machine)
   300  				continue
   301  			}
   302  		} else {
   303  			logger.Infof("machine %v already started as instance %q", machine, instId)
   304  		}
   305  	}
   306  	logger.Tracef("pending machines: %v", pending)
   307  	logger.Tracef("dead machines: %v", dead)
   308  	return
   309  }
   310  
   311  // findUnknownInstances finds instances which are not associated with a machine.
   312  func (task *provisionerTask) findUnknownInstances(stopping []instance.Instance) ([]instance.Instance, error) {
   313  	// Make a copy of the instances we know about.
   314  	instances := make(map[instance.Id]instance.Instance)
   315  	for k, v := range task.instances {
   316  		instances[k] = v
   317  	}
   318  
   319  	for _, m := range task.machines {
   320  		instId, err := m.InstanceId()
   321  		switch {
   322  		case err == nil:
   323  			delete(instances, instId)
   324  		case params.IsCodeNotProvisioned(err):
   325  		case params.IsCodeNotFoundOrCodeUnauthorized(err):
   326  		default:
   327  			return nil, err
   328  		}
   329  	}
   330  	// Now remove all those instances that we are stopping already as we
   331  	// know about those and don't want to include them in the unknown list.
   332  	for _, inst := range stopping {
   333  		delete(instances, inst.Id())
   334  	}
   335  	var unknown []instance.Instance
   336  	for _, inst := range instances {
   337  		unknown = append(unknown, inst)
   338  	}
   339  	return unknown, nil
   340  }
   341  
   342  // instancesForMachines returns a list of instance.Instance that represent
   343  // the list of machines running in the provider. Missing machines are
   344  // omitted from the list.
   345  func (task *provisionerTask) instancesForMachines(machines []*apiprovisioner.Machine) []instance.Instance {
   346  	var instances []instance.Instance
   347  	for _, machine := range machines {
   348  		instId, err := machine.InstanceId()
   349  		if err == nil {
   350  			instance, found := task.instances[instId]
   351  			// If the instance is not found we can't stop it.
   352  			if found {
   353  				instances = append(instances, instance)
   354  			}
   355  		}
   356  	}
   357  	return instances
   358  }
   359  
   360  func (task *provisionerTask) stopInstances(instances []instance.Instance) error {
   361  	// Although calling StopInstance with an empty slice should produce no change in the
   362  	// provider, environs like dummy do not consider this a noop.
   363  	if len(instances) == 0 {
   364  		return nil
   365  	}
   366  	if err := task.broker.StopInstances(instances); err != nil {
   367  		logger.Errorf("broker failed to stop instances: %v", err)
   368  		return err
   369  	}
   370  	return nil
   371  }
   372  
   373  func (task *provisionerTask) startMachines(machines []*apiprovisioner.Machine) error {
   374  	for _, m := range machines {
   375  		if err := task.startMachine(m); err != nil {
   376  			return errors.Notef(err, "cannot start machine %v", m)
   377  		}
   378  	}
   379  	return nil
   380  }
   381  
   382  func (task *provisionerTask) startMachine(machine *apiprovisioner.Machine) error {
   383  	cons, err := machine.Constraints()
   384  	if err != nil {
   385  		return mask(err)
   386  	}
   387  	series, err := machine.Series()
   388  	if err != nil {
   389  		return mask(err)
   390  	}
   391  	possibleTools, err := task.possibleTools(series, cons)
   392  	if err != nil {
   393  		return mask(err)
   394  	}
   395  	machineConfig, err := task.machineConfig(machine)
   396  	if err != nil {
   397  		return mask(err)
   398  	}
   399  	inst, metadata, err := task.broker.StartInstance(cons, possibleTools, machineConfig)
   400  	if err != nil {
   401  		// Set the state to error, so the machine will be skipped next
   402  		// time until the error is resolved, but don't return an
   403  		// error; just keep going with the other machines.
   404  		logger.Errorf("cannot start instance for machine %q: %v", machine, err)
   405  		if err1 := machine.SetStatus(params.StatusError, err.Error()); err1 != nil {
   406  			// Something is wrong with this machine, better report it back.
   407  			logger.Errorf("cannot set error status for machine %q: %v", machine, err1)
   408  			return err1
   409  		}
   410  		return nil
   411  	}
   412  	nonce := machineConfig.MachineNonce
   413  	if err := machine.SetProvisioned(inst.Id(), nonce, metadata); err != nil {
   414  		logger.Errorf("cannot register instance for machine %v: %v", machine, err)
   415  		// The machine is started, but we can't record the mapping in
   416  		// state. It'll keep running while we fail out and restart,
   417  		// but will then be detected by findUnknownInstances and
   418  		// killed again.
   419  		//
   420  		// TODO(dimitern) Stop the instance right away here.
   421  		//
   422  		// Multiple instantiations of a given machine (with the same
   423  		// machine ID) cannot coexist, because findUnknownInstances is
   424  		// called before startMachines. However, if the first machine
   425  		// had started to do work before being replaced, we may
   426  		// encounter surprising problems.
   427  		return err
   428  	}
   429  	logger.Infof("started machine %s as instance %s with hardware %q", machine, inst.Id(), metadata)
   430  	return nil
   431  }
   432  
   433  func (task *provisionerTask) possibleTools(series string, cons constraints.Value) (coretools.List, error) {
   434  	if env, ok := task.broker.(environs.Environ); ok {
   435  		agentVersion, ok := env.Config().AgentVersion()
   436  		if !ok {
   437  			return nil, errors.Newf("no agent version set in environment configuration")
   438  		}
   439  		return tools.FindInstanceTools(env, agentVersion, series, cons.Arch)
   440  	}
   441  	if hasTools, ok := task.broker.(coretools.HasTools); ok {
   442  		return hasTools.Tools(), nil
   443  	}
   444  	panic(errors.Newf("broker of type %T does not provide any tools", task.broker))
   445  }
   446  
   447  func (task *provisionerTask) machineConfig(machine *apiprovisioner.Machine) (*cloudinit.MachineConfig, error) {
   448  	stateInfo, apiInfo, err := task.auth.SetupAuthentication(machine)
   449  	if err != nil {
   450  		logger.Errorf("failed to setup authentication: %v", err)
   451  		return nil, err
   452  	}
   453  	// Generated a nonce for the new instance, with the format: "machine-#:UUID".
   454  	// The first part is a badge, specifying the tag of the machine the provisioner
   455  	// is running on, while the second part is a random UUID.
   456  	uuid, err := utils.NewUUID()
   457  	if err != nil {
   458  		return nil, mask(err)
   459  	}
   460  	nonce := fmt.Sprintf("%s:%s", task.machineTag, uuid.String())
   461  	machineConfig := environs.NewMachineConfig(machine.Id(), nonce, stateInfo, apiInfo)
   462  	return machineConfig, nil
   463  }