github.com/cloudbase/juju-core@v0.0.0-20140504232958-a7271ac7912f/cmd/jujud/machine.go (about)

     1  // Copyright 2012, 2013 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package main
     5  
     6  import (
     7  	"fmt"
     8  	"os"
     9  	"path/filepath"
    10  	"time"
    11  
    12  	"github.com/juju/loggo"
    13  	"launchpad.net/gnuflag"
    14  	"launchpad.net/tomb"
    15  
    16  	"launchpad.net/juju-core/agent"
    17  	"launchpad.net/juju-core/charm"
    18  	"launchpad.net/juju-core/juju/osenv"
    19  	"launchpad.net/juju-core/cmd"
    20  	"launchpad.net/juju-core/container/kvm"
    21  	"launchpad.net/juju-core/instance"
    22  	"launchpad.net/juju-core/names"
    23  	"launchpad.net/juju-core/provider"
    24  	"launchpad.net/juju-core/state"
    25  	"launchpad.net/juju-core/state/api"
    26  	apiagent "launchpad.net/juju-core/state/api/agent"
    27  	"launchpad.net/juju-core/state/api/params"
    28  	apiprovisioner "launchpad.net/juju-core/state/api/provisioner"
    29  	"launchpad.net/juju-core/state/apiserver"
    30  	"launchpad.net/juju-core/upgrades"
    31  	"launchpad.net/juju-core/upstart"
    32  	"launchpad.net/juju-core/version"
    33  	"launchpad.net/juju-core/worker"
    34  	"launchpad.net/juju-core/worker/cleaner"
    35  	"launchpad.net/juju-core/worker/instancepoller"
    36  	"launchpad.net/juju-core/worker/localstorage"
    37  	"launchpad.net/juju-core/worker/minunitsworker"
    38  	"launchpad.net/juju-core/worker/provisioner"
    39  	"launchpad.net/juju-core/worker/resumer"
    40  	"launchpad.net/juju-core/worker/terminationworker"
    41  )
    42  
    43  var logger = loggo.GetLogger("juju.cmd.jujud")
    44  
    45  var newRunner = func(isFatal func(error) bool, moreImportant func(e0, e1 error) bool) worker.Runner {
    46  	return worker.NewRunner(isFatal, moreImportant)
    47  }
    48  
    49  const bootstrapMachineId = "0"
    50  
    51  var retryDelay = 3 * time.Second
    52  
    53  var jujuRun = osenv.JujuRun
    54  
    55  // MachineAgent is a cmd.Command responsible for running a machine agent.
    56  type MachineAgent struct {
    57  	cmd.CommandBase
    58  	tomb            tomb.Tomb
    59  	Conf            AgentConf
    60  	MachineId       string
    61  	runner          worker.Runner
    62  	upgradeComplete chan struct{}
    63  	stateOpened     chan struct{}
    64  	st              *state.State
    65  }
    66  
    67  // Info returns usage information for the command.
    68  func (a *MachineAgent) Info() *cmd.Info {
    69  	return &cmd.Info{
    70  		Name:    "machine",
    71  		Purpose: "run a juju machine agent",
    72  	}
    73  }
    74  
    75  func (a *MachineAgent) SetFlags(f *gnuflag.FlagSet) {
    76  	a.Conf.addFlags(f)
    77  	f.StringVar(&a.MachineId, "machine-id", "", "id of the machine to run")
    78  }
    79  
    80  // Init initializes the command for running.
    81  func (a *MachineAgent) Init(args []string) error {
    82  	if !names.IsMachine(a.MachineId) {
    83  		return fmt.Errorf("--machine-id option must be set, and expects a non-negative integer")
    84  	}
    85  	if err := a.Conf.checkArgs(args); err != nil {
    86  		return err
    87  	}
    88  	a.runner = newRunner(isFatal, moreImportant)
    89  	a.upgradeComplete = make(chan struct{})
    90  	a.stateOpened = make(chan struct{})
    91  	return nil
    92  }
    93  
    94  // Wait waits for the machine agent to finish.
    95  func (a *MachineAgent) Wait() error {
    96  	return a.tomb.Wait()
    97  }
    98  
    99  // Stop stops the machine agent.
   100  func (a *MachineAgent) Stop() error {
   101  	a.runner.Kill()
   102  	return a.tomb.Wait()
   103  }
   104  
   105  // Run runs a machine agent.
   106  func (a *MachineAgent) Run(_ *cmd.Context) error {
   107  	// Due to changes in the logging, and needing to care about old
   108  	// environments that have been upgraded, we need to explicitly remove the
   109  	// file writer if one has been added, otherwise we will get duplicate
   110  	// lines of all logging in the log file.
   111  	loggo.RemoveWriter("logfile")
   112  	defer a.tomb.Done()
   113  	logger.Infof("machine agent %v start (%s)", a.Tag(), version.Current)
   114  	if err := a.Conf.read(a.Tag()); err != nil {
   115  		return err
   116  	}
   117  	charm.CacheDir = filepath.Join(a.Conf.dataDir, "charmcache")
   118  	if err := a.initAgent(); err != nil {
   119  		return err
   120  	}
   121  
   122  	// ensureStateWorker ensures that there is a worker that
   123  	// connects to the state that runs within itself all the workers
   124  	// that need a state connection. Unless we're bootstrapping, we
   125  	// need to connect to the API server to find out if we need to
   126  	// call this, so we make the APIWorker call it when necessary if
   127  	// the machine requires it. Note that ensureStateWorker can be
   128  	// called many times - StartWorker does nothing if there is
   129  	// already a worker started with the given name.
   130  	ensureStateWorker := func() {
   131  		a.runner.StartWorker("state", a.StateWorker)
   132  	}
   133  	// We might be bootstrapping, and the API server is not
   134  	// running yet. If so, make sure we run a state worker instead.
   135  	if a.MachineId == bootstrapMachineId {
   136  		// TODO(rog) When we have HA, we only want to do this
   137  		// when we really are bootstrapping - once other
   138  		// instances of the API server have been started, we
   139  		// should follow the normal course of things and ignore
   140  		// the fact that this was once the bootstrap machine.
   141  		logger.Infof("Starting StateWorker for machine-0")
   142  		ensureStateWorker()
   143  	}
   144  	a.runner.StartWorker("api", func() (worker.Worker, error) {
   145  		return a.APIWorker(ensureStateWorker)
   146  	})
   147  	a.runner.StartWorker("termination", func() (worker.Worker, error) {
   148  		return terminationworker.NewWorker(), nil
   149  	})
   150  	err := a.runner.Wait()
   151  	if err == worker.ErrTerminateAgent {
   152  		err = a.uninstallAgent()
   153  	}
   154  	err = agentDone(err)
   155  	a.tomb.Kill(err)
   156  	return err
   157  }
   158  
   159  // setupContainerSupport determines what containers can be run on this machine and
   160  // initialises suitable infrastructure to support such containers.
   161  func (a *MachineAgent) setupContainerSupport(runner worker.Runner, st *api.State, entity *apiagent.Entity) error {
   162  	var supportedContainers []instance.ContainerType
   163  	// We don't yet support nested lxc containers but anything else can run an LXC container.
   164  	if entity.ContainerType() != instance.LXC {
   165  		supportedContainers = append(supportedContainers, instance.LXC)
   166  	}
   167  	supportsKvm, err := kvm.IsKVMSupported()
   168  	if err != nil {
   169  		logger.Warningf("determining kvm support: %v\nno kvm containers possible", err)
   170  	}
   171  	if err == nil && supportsKvm {
   172  		supportedContainers = append(supportedContainers, instance.KVM)
   173  	}
   174  	return a.updateSupportedContainers(runner, st, entity.Tag(), supportedContainers)
   175  }
   176  
   177  // updateSupportedContainers records in state that a machine can run the specified containers.
   178  // It starts a watcher and when a container of a given type is first added to the machine,
   179  // the watcher is killed, the machine is set up to be able to start containers of the given type,
   180  // and a suitable provisioner is started.
   181  func (a *MachineAgent) updateSupportedContainers(runner worker.Runner, st *api.State,
   182  	tag string, containers []instance.ContainerType) error {
   183  
   184  	var machine *apiprovisioner.Machine
   185  	var err error
   186  	pr := st.Provisioner()
   187  	if machine, err = pr.Machine(tag); err != nil {
   188  		return fmt.Errorf("%s is not in state: %v", tag, err)
   189  	}
   190  	if len(containers) == 0 {
   191  		if err := machine.SupportsNoContainers(); err != nil {
   192  			return fmt.Errorf("clearing supported containers for %s: %v", tag, err)
   193  		}
   194  		return nil
   195  	}
   196  	if err := machine.SetSupportedContainers(containers...); err != nil {
   197  		return fmt.Errorf("setting supported containers for %s: %v", tag, err)
   198  	}
   199  	// Start the watcher to fire when a container is first requested on the machine.
   200  	watcherName := fmt.Sprintf("%s-container-watcher", machine.Id())
   201  	handler := provisioner.NewContainerSetupHandler(runner, watcherName, containers, machine, pr, a.Conf.config)
   202  	a.startWorkerAfterUpgrade(runner, watcherName, func() (worker.Worker, error) {
   203  		return worker.NewStringsWorker(handler), nil
   204  	})
   205  	return nil
   206  }
   207  
   208  // StateJobs returns a worker running all the workers that require
   209  // a *state.State connection.
   210  func (a *MachineAgent) StateWorker() (worker.Worker, error) {
   211  	agentConfig := a.Conf.config
   212  	st, entity, err := openState(agentConfig, a)
   213  	if err != nil {
   214  		return nil, err
   215  	}
   216  	a.st = st
   217  	close(a.stateOpened)
   218  	reportOpenedState(st)
   219  	m := entity.(*state.Machine)
   220  
   221  	runner := newRunner(connectionIsFatal(st), moreImportant)
   222  	// Take advantage of special knowledge here in that we will only ever want
   223  	// the storage provider on one machine, and that is the "bootstrap" node.
   224  	providerType := agentConfig.Value(agent.ProviderType)
   225  	if (providerType == provider.Local || provider.IsManual(providerType)) && m.Id() == bootstrapMachineId {
   226  		a.startWorkerAfterUpgrade(runner, "local-storage", func() (worker.Worker, error) {
   227  			// TODO(axw) 2013-09-24 bug #1229507
   228  			// Make another job to enable storage.
   229  			// There's nothing special about this.
   230  			return localstorage.NewWorker(agentConfig), nil
   231  		})
   232  	}
   233  	for _, job := range m.Jobs() {
   234  		switch job {
   235  		case state.JobHostUnits:
   236  			// Implemented in APIWorker.
   237  		case state.JobManageEnviron:
   238  			a.startWorkerAfterUpgrade(runner, "instancepoller", func() (worker.Worker, error) {
   239  				return instancepoller.NewWorker(st), nil
   240  			})
   241  			runner.StartWorker("apiserver", func() (worker.Worker, error) {
   242  				// If the configuration does not have the required information,
   243  				// it is currently not a recoverable error, so we kill the whole
   244  				// agent, potentially enabling human intervention to fix
   245  				// the agent's configuration file. In the future, we may retrieve
   246  				// the state server certificate and key from the state, and
   247  				// this should then change.
   248  				port, cert, key := a.Conf.config.APIServerDetails()
   249  				if len(cert) == 0 || len(key) == 0 {
   250  					return nil, &fatalError{"configuration does not have state server cert/key"}
   251  				}
   252  				dataDir := a.Conf.config.DataDir()
   253  				return apiserver.NewServer(st, fmt.Sprintf(":%d", port), cert, key, dataDir)
   254  			})
   255  			a.startWorkerAfterUpgrade(runner, "cleaner", func() (worker.Worker, error) {
   256  				return cleaner.NewCleaner(st), nil
   257  			})
   258  			a.startWorkerAfterUpgrade(runner, "resumer", func() (worker.Worker, error) {
   259  				// The action of resumer is so subtle that it is not tested,
   260  				// because we can't figure out how to do so without brutalising
   261  				// the transaction log.
   262  				return resumer.NewResumer(st), nil
   263  			})
   264  			a.startWorkerAfterUpgrade(runner, "minunitsworker", func() (worker.Worker, error) {
   265  				return minunitsworker.NewMinUnitsWorker(st), nil
   266  			})
   267  		case state.JobManageStateDeprecated:
   268  			// Legacy environments may set this, but we ignore it.
   269  		default:
   270  			logger.Warningf("ignoring unknown job %q", job)
   271  		}
   272  	}
   273  	return newCloseWorker(runner, st), nil
   274  }
   275  
   276  // startWorker starts a worker to run the specified child worker but only after waiting for upgrades to complete.
   277  func (a *MachineAgent) startWorkerAfterUpgrade(runner worker.Runner, name string, start func() (worker.Worker, error)) {
   278  	runner.StartWorker(name, func() (worker.Worker, error) {
   279  		return a.upgradeWaiterWorker(start), nil
   280  	})
   281  }
   282  
   283  // upgradeWaiterWorker runs the specified worker after upgrades have completed.
   284  func (a *MachineAgent) upgradeWaiterWorker(start func() (worker.Worker, error)) worker.Worker {
   285  	return worker.NewSimpleWorker(func(stop <-chan struct{}) error {
   286  		// wait for the upgrade to complete (or for us to be stopped)
   287  		select {
   288  		case <-stop:
   289  			return nil
   290  		case <-a.upgradeComplete:
   291  		}
   292  		w, err := start()
   293  		if err != nil {
   294  			return err
   295  		}
   296  		waitCh := make(chan error)
   297  		go func() {
   298  			waitCh <- w.Wait()
   299  		}()
   300  		select {
   301  		case err := <-waitCh:
   302  			return err
   303  		case <-stop:
   304  			w.Kill()
   305  		}
   306  		return <-waitCh
   307  	})
   308  }
   309  
   310  // upgradeWorker runs the required upgrade operations to upgrade to the current Juju version.
   311  func (a *MachineAgent) upgradeWorker(apiState *api.State, jobs []params.MachineJob) worker.Worker {
   312  	return worker.NewSimpleWorker(func(stop <-chan struct{}) error {
   313  		select {
   314  		case <-a.upgradeComplete:
   315  			// Our work is already done (we're probably being restarted
   316  			// because the API connection has gone down), so do nothing.
   317  			<-stop
   318  			return nil
   319  		default:
   320  		}
   321  		// If the machine agent is a state server, wait until state is opened.
   322  		var st *state.State
   323  		for _, job := range jobs {
   324  			if job == params.JobManageEnviron {
   325  				select {
   326  				case <-a.stateOpened:
   327  				}
   328  				st = a.st
   329  				break
   330  			}
   331  		}
   332  		err := a.runUpgrades(st, apiState, jobs)
   333  		if err != nil {
   334  			return err
   335  		}
   336  		logger.Infof("Upgrade to %v completed.", version.Current)
   337  		close(a.upgradeComplete)
   338  		<-stop
   339  		return nil
   340  	})
   341  }
   342  
   343  // runUpgrades runs the upgrade operations for each job type and updates the updatedToVersion on success.
   344  func (a *MachineAgent) runUpgrades(st *state.State, apiState *api.State, jobs []params.MachineJob) error {
   345  	agentConfig := a.Conf.config
   346  	from := version.Current
   347  	from.Number = agentConfig.UpgradedToVersion()
   348  	if from == version.Current {
   349  		logger.Infof("Upgrade to %v already completed.", version.Current)
   350  		return nil
   351  	}
   352  	context := upgrades.NewContext(agentConfig, apiState, st)
   353  	for _, job := range jobs {
   354  		var target upgrades.Target
   355  		switch job {
   356  		case params.JobManageEnviron:
   357  			target = upgrades.StateServer
   358  		case params.JobHostUnits:
   359  			target = upgrades.HostMachine
   360  		default:
   361  			continue
   362  		}
   363  		logger.Infof("Starting upgrade from %v to %v for %v", from, version.Current, target)
   364  		if err := upgrades.PerformUpgrade(from.Number, target, context); err != nil {
   365  			return fmt.Errorf("cannot perform upgrade from %v to %v for %v: %v", from, version.Current, target, err)
   366  		}
   367  	}
   368  	return a.Conf.config.WriteUpgradedToVersion(version.Current.Number)
   369  }
   370  
   371  func (a *MachineAgent) Entity(st *state.State) (AgentState, error) {
   372  	m, err := st.Machine(a.MachineId)
   373  	if err != nil {
   374  		return nil, err
   375  	}
   376  	// Check the machine nonce as provisioned matches the agent.Conf value.
   377  	if !m.CheckProvisioned(a.Conf.config.Nonce()) {
   378  		// The agent is running on a different machine to the one it
   379  		// should be according to state. It must stop immediately.
   380  		logger.Errorf("running machine %v agent on inappropriate instance", m)
   381  		return nil, worker.ErrTerminateAgent
   382  	}
   383  	return m, nil
   384  }
   385  
   386  func (a *MachineAgent) Tag() string {
   387  	return names.MachineTag(a.MachineId)
   388  }
   389  
   390  func (a *MachineAgent) uninstallAgent() error {
   391  	var errors []error
   392  	agentServiceName := a.Conf.config.Value(agent.AgentServiceName)
   393  	if agentServiceName == "" {
   394  		// For backwards compatibility, handle lack of AgentServiceName.
   395  		agentServiceName = os.Getenv("UPSTART_JOB")
   396  	}
   397  	if agentServiceName != "" {
   398  		if err := upstart.NewService(agentServiceName).Remove(); err != nil {
   399  			errors = append(errors, fmt.Errorf("cannot remove service %q: %v", agentServiceName, err))
   400  		}
   401  	}
   402  	// Remove the juju-run symlink.
   403  	if err := os.Remove(jujuRun); err != nil && !os.IsNotExist(err) {
   404  		errors = append(errors, err)
   405  	}
   406  	// The machine agent may terminate without knowing its jobs,
   407  	// for example if the machine's entry in state was removed.
   408  	// Thus, we do not rely on jobs here, and instead just check
   409  	// if the upstart config exists.
   410  	mongoServiceName := a.Conf.config.Value(agent.MongoServiceName)
   411  	if mongoServiceName != "" {
   412  		if err := upstart.NewService(mongoServiceName).StopAndRemove(); err != nil {
   413  			errors = append(errors, fmt.Errorf("cannot stop/remove service %q: %v", mongoServiceName, err))
   414  		}
   415  	}
   416  	if err := os.RemoveAll(a.Conf.dataDir); err != nil {
   417  		errors = append(errors, err)
   418  	}
   419  	if len(errors) == 0 {
   420  		return nil
   421  	}
   422  	return fmt.Errorf("uninstall failed: %v", errors)
   423  }
   424  
// The pieces below are used for testing, to give us access to the *State
// opened by the agent, and to allow us to trigger syncs without waiting 5s
// for them to happen automatically.
   428  
   429  var stateReporter chan<- *state.State
   430  
   431  func reportOpenedState(st *state.State) {
   432  	select {
   433  	case stateReporter <- st:
   434  	default:
   435  	}
   436  }
   437  
   438  func sendOpenedStates(dst chan<- *state.State) (undo func()) {
   439  	var original chan<- *state.State
   440  	original, stateReporter = stateReporter, dst
   441  	return func() { stateReporter = original }
   442  }
   443  
   444  var apiReporter chan<- *api.State
   445  
   446  func reportOpenedAPI(st *api.State) {
   447  	select {
   448  	case apiReporter <- st:
   449  	default:
   450  	}
   451  }
   452  func sendOpenedAPIs(dst chan<- *api.State) (undo func()) {
   453  	var original chan<- *api.State
   454  	original, apiReporter = apiReporter, dst
   455  	return func() { apiReporter = original }
   456  }