
     1  // Copyright 2012, 2013 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     4  package main
     6  import (
     7  	"fmt"
     8  	"os"
     9  	"path/filepath"
    10  	"time"
    12  	""
    13  	""
    14  	""
    16  	""
    17  	""
    18  	""
    19  	""
    20  	""
    21  	""
    22  	""
    23  	""
    24  	""
    25  	""
    26  	apiagent ""
    27  	""
    28  	apiprovisioner ""
    29  	""
    30  	""
    31  	""
    32  	""
    33  	""
    34  	""
    35  	""
    36  	""
    37  	""
    38  	""
    39  	""
    40  	""
    41  )
    43  var logger = loggo.GetLogger("juju.cmd.jujud")
    45  var newRunner = func(isFatal func(error) bool, moreImportant func(e0, e1 error) bool) worker.Runner {
    46  	return worker.NewRunner(isFatal, moreImportant)
    47  }
    49  const bootstrapMachineId = "0"
    51  var retryDelay = 3 * time.Second
    53  var jujuRun = osenv.JujuRun
    55  // MachineAgent is a cmd.Command responsible for running a machine agent.
    56  type MachineAgent struct {
    57  	cmd.CommandBase
    58  	tomb            tomb.Tomb
    59  	Conf            AgentConf
    60  	MachineId       string
    61  	runner          worker.Runner
    62  	upgradeComplete chan struct{}
    63  	stateOpened     chan struct{}
    64  	st              *state.State
    65  }
    67  // Info returns usage information for the command.
    68  func (a *MachineAgent) Info() *cmd.Info {
    69  	return &cmd.Info{
    70  		Name:    "machine",
    71  		Purpose: "run a juju machine agent",
    72  	}
    73  }
    75  func (a *MachineAgent) SetFlags(f *gnuflag.FlagSet) {
    76  	a.Conf.addFlags(f)
    77  	f.StringVar(&a.MachineId, "machine-id", "", "id of the machine to run")
    78  }
    80  // Init initializes the command for running.
    81  func (a *MachineAgent) Init(args []string) error {
    82  	if !names.IsMachine(a.MachineId) {
    83  		return fmt.Errorf("--machine-id option must be set, and expects a non-negative integer")
    84  	}
    85  	if err := a.Conf.checkArgs(args); err != nil {
    86  		return err
    87  	}
    88  	a.runner = newRunner(isFatal, moreImportant)
    89  	a.upgradeComplete = make(chan struct{})
    90  	a.stateOpened = make(chan struct{})
    91  	return nil
    92  }
    94  // Wait waits for the machine agent to finish.
    95  func (a *MachineAgent) Wait() error {
    96  	return a.tomb.Wait()
    97  }
    99  // Stop stops the machine agent.
   100  func (a *MachineAgent) Stop() error {
   101  	a.runner.Kill()
   102  	return a.tomb.Wait()
   103  }
   105  // Run runs a machine agent.
   106  func (a *MachineAgent) Run(_ *cmd.Context) error {
   107  	// Due to changes in the logging, and needing to care about old
   108  	// environments that have been upgraded, we need to explicitly remove the
   109  	// file writer if one has been added, otherwise we will get duplicate
   110  	// lines of all logging in the log file.
   111  	loggo.RemoveWriter("logfile")
   112  	defer a.tomb.Done()
   113  	logger.Infof("machine agent %v start (%s)", a.Tag(), version.Current)
   114  	if err :=; err != nil {
   115  		return err
   116  	}
   117  	charm.CacheDir = filepath.Join(a.Conf.dataDir, "charmcache")
   118  	if err := a.initAgent(); err != nil {
   119  		return err
   120  	}
   122  	// ensureStateWorker ensures that there is a worker that
   123  	// connects to the state that runs within itself all the workers
   124  	// that need a state connection. Unless we're bootstrapping, we
   125  	// need to connect to the API server to find out if we need to
   126  	// call this, so we make the APIWorker call it when necessary if
   127  	// the machine requires it. Note that ensureStateWorker can be
   128  	// called many times - StartWorker does nothing if there is
   129  	// already a worker started with the given name.
   130  	ensureStateWorker := func() {
   131  		a.runner.StartWorker("state", a.StateWorker)
   132  	}
   133  	// We might be bootstrapping, and the API server is not
   134  	// running yet. If so, make sure we run a state worker instead.
   135  	if a.MachineId == bootstrapMachineId {
   136  		// TODO(rog) When we have HA, we only want to do this
   137  		// when we really are bootstrapping - once other
   138  		// instances of the API server have been started, we
   139  		// should follow the normal course of things and ignore
   140  		// the fact that this was once the bootstrap machine.
   141  		logger.Infof("Starting StateWorker for machine-0")
   142  		ensureStateWorker()
   143  	}
   144  	a.runner.StartWorker("api", func() (worker.Worker, error) {
   145  		return a.APIWorker(ensureStateWorker)
   146  	})
   147  	a.runner.StartWorker("termination", func() (worker.Worker, error) {
   148  		return terminationworker.NewWorker(), nil
   149  	})
   150  	err := a.runner.Wait()
   151  	if err == worker.ErrTerminateAgent {
   152  		err = a.uninstallAgent()
   153  	}
   154  	err = agentDone(err)
   155  	a.tomb.Kill(err)
   156  	return err
   157  }
   159  // setupContainerSupport determines what containers can be run on this machine and
   160  // initialises suitable infrastructure to support such containers.
   161  func (a *MachineAgent) setupContainerSupport(runner worker.Runner, st *api.State, entity *apiagent.Entity) error {
   162  	var supportedContainers []instance.ContainerType
   163  	// We don't yet support nested lxc containers but anything else can run an LXC container.
   164  	if entity.ContainerType() != instance.LXC {
   165  		supportedContainers = append(supportedContainers, instance.LXC)
   166  	}
   167  	supportsKvm, err := kvm.IsKVMSupported()
   168  	if err != nil {
   169  		logger.Warningf("determining kvm support: %v\nno kvm containers possible", err)
   170  	}
   171  	if err == nil && supportsKvm {
   172  		supportedContainers = append(supportedContainers, instance.KVM)
   173  	}
   174  	return a.updateSupportedContainers(runner, st, entity.Tag(), supportedContainers)
   175  }
   177  // updateSupportedContainers records in state that a machine can run the specified containers.
   178  // It starts a watcher and when a container of a given type is first added to the machine,
   179  // the watcher is killed, the machine is set up to be able to start containers of the given type,
   180  // and a suitable provisioner is started.
   181  func (a *MachineAgent) updateSupportedContainers(runner worker.Runner, st *api.State,
   182  	tag string, containers []instance.ContainerType) error {
   184  	var machine *apiprovisioner.Machine
   185  	var err error
   186  	pr := st.Provisioner()
   187  	if machine, err = pr.Machine(tag); err != nil {
   188  		return fmt.Errorf("%s is not in state: %v", tag, err)
   189  	}
   190  	if len(containers) == 0 {
   191  		if err := machine.SupportsNoContainers(); err != nil {
   192  			return fmt.Errorf("clearing supported containers for %s: %v", tag, err)
   193  		}
   194  		return nil
   195  	}
   196  	if err := machine.SetSupportedContainers(containers...); err != nil {
   197  		return fmt.Errorf("setting supported containers for %s: %v", tag, err)
   198  	}
   199  	// Start the watcher to fire when a container is first requested on the machine.
   200  	watcherName := fmt.Sprintf("%s-container-watcher", machine.Id())
   201  	handler := provisioner.NewContainerSetupHandler(runner, watcherName, containers, machine, pr, a.Conf.config)
   202  	a.startWorkerAfterUpgrade(runner, watcherName, func() (worker.Worker, error) {
   203  		return worker.NewStringsWorker(handler), nil
   204  	})
   205  	return nil
   206  }
   208  // StateJobs returns a worker running all the workers that require
   209  // a *state.State connection.
   210  func (a *MachineAgent) StateWorker() (worker.Worker, error) {
   211  	agentConfig := a.Conf.config
   212  	st, entity, err := openState(agentConfig, a)
   213  	if err != nil {
   214  		return nil, err
   215  	}
   216 = st
   217  	close(a.stateOpened)
   218  	reportOpenedState(st)
   219  	m := entity.(*state.Machine)
   221  	runner := newRunner(connectionIsFatal(st), moreImportant)
   222  	// Take advantage of special knowledge here in that we will only ever want
   223  	// the storage provider on one machine, and that is the "bootstrap" node.
   224  	providerType := agentConfig.Value(agent.ProviderType)
   225  	if (providerType == provider.Local || provider.IsManual(providerType)) && m.Id() == bootstrapMachineId {
   226  		a.startWorkerAfterUpgrade(runner, "local-storage", func() (worker.Worker, error) {
   227  			// TODO(axw) 2013-09-24 bug #1229507
   228  			// Make another job to enable storage.
   229  			// There's nothing special about this.
   230  			return localstorage.NewWorker(agentConfig), nil
   231  		})
   232  	}
   233  	for _, job := range m.Jobs() {
   234  		switch job {
   235  		case state.JobHostUnits:
   236  			// Implemented in APIWorker.
   237  		case state.JobManageEnviron:
   238  			a.startWorkerAfterUpgrade(runner, "instancepoller", func() (worker.Worker, error) {
   239  				return instancepoller.NewWorker(st), nil
   240  			})
   241  			runner.StartWorker("apiserver", func() (worker.Worker, error) {
   242  				// If the configuration does not have the required information,
   243  				// it is currently not a recoverable error, so we kill the whole
   244  				// agent, potentially enabling human intervention to fix
   245  				// the agent's configuration file. In the future, we may retrieve
   246  				// the state server certificate and key from the state, and
   247  				// this should then change.
   248  				port, cert, key := a.Conf.config.APIServerDetails()
   249  				if len(cert) == 0 || len(key) == 0 {
   250  					return nil, &fatalError{"configuration does not have state server cert/key"}
   251  				}
   252  				dataDir := a.Conf.config.DataDir()
   253  				return apiserver.NewServer(st, fmt.Sprintf(":%d", port), cert, key, dataDir)
   254  			})
   255  			a.startWorkerAfterUpgrade(runner, "cleaner", func() (worker.Worker, error) {
   256  				return cleaner.NewCleaner(st), nil
   257  			})
   258  			a.startWorkerAfterUpgrade(runner, "resumer", func() (worker.Worker, error) {
   259  				// The action of resumer is so subtle that it is not tested,
   260  				// because we can't figure out how to do so without brutalising
   261  				// the transaction log.
   262  				return resumer.NewResumer(st), nil
   263  			})
   264  			a.startWorkerAfterUpgrade(runner, "minunitsworker", func() (worker.Worker, error) {
   265  				return minunitsworker.NewMinUnitsWorker(st), nil
   266  			})
   267  		case state.JobManageStateDeprecated:
   268  			// Legacy environments may set this, but we ignore it.
   269  		default:
   270  			logger.Warningf("ignoring unknown job %q", job)
   271  		}
   272  	}
   273  	return newCloseWorker(runner, st), nil
   274  }
   276  // startWorker starts a worker to run the specified child worker but only after waiting for upgrades to complete.
   277  func (a *MachineAgent) startWorkerAfterUpgrade(runner worker.Runner, name string, start func() (worker.Worker, error)) {
   278  	runner.StartWorker(name, func() (worker.Worker, error) {
   279  		return a.upgradeWaiterWorker(start), nil
   280  	})
   281  }
   283  // upgradeWaiterWorker runs the specified worker after upgrades have completed.
   284  func (a *MachineAgent) upgradeWaiterWorker(start func() (worker.Worker, error)) worker.Worker {
   285  	return worker.NewSimpleWorker(func(stop <-chan struct{}) error {
   286  		// wait for the upgrade to complete (or for us to be stopped)
   287  		select {
   288  		case <-stop:
   289  			return nil
   290  		case <-a.upgradeComplete:
   291  		}
   292  		w, err := start()
   293  		if err != nil {
   294  			return err
   295  		}
   296  		waitCh := make(chan error)
   297  		go func() {
   298  			waitCh <- w.Wait()
   299  		}()
   300  		select {
   301  		case err := <-waitCh:
   302  			return err
   303  		case <-stop:
   304  			w.Kill()
   305  		}
   306  		return <-waitCh
   307  	})
   308  }
   310  // upgradeWorker runs the required upgrade operations to upgrade to the current Juju version.
   311  func (a *MachineAgent) upgradeWorker(apiState *api.State, jobs []params.MachineJob) worker.Worker {
   312  	return worker.NewSimpleWorker(func(stop <-chan struct{}) error {
   313  		select {
   314  		case <-a.upgradeComplete:
   315  			// Our work is already done (we're probably being restarted
   316  			// because the API connection has gone down), so do nothing.
   317  			<-stop
   318  			return nil
   319  		default:
   320  		}
   321  		// If the machine agent is a state server, wait until state is opened.
   322  		var st *state.State
   323  		for _, job := range jobs {
   324  			if job == params.JobManageEnviron {
   325  				select {
   326  				case <-a.stateOpened:
   327  				}
   328  				st =
   329  				break
   330  			}
   331  		}
   332  		err := a.runUpgrades(st, apiState, jobs)
   333  		if err != nil {
   334  			return err
   335  		}
   336  		logger.Infof("Upgrade to %v completed.", version.Current)
   337  		close(a.upgradeComplete)
   338  		<-stop
   339  		return nil
   340  	})
   341  }
   343  // runUpgrades runs the upgrade operations for each job type and updates the updatedToVersion on success.
   344  func (a *MachineAgent) runUpgrades(st *state.State, apiState *api.State, jobs []params.MachineJob) error {
   345  	agentConfig := a.Conf.config
   346  	from := version.Current
   347  	from.Number = agentConfig.UpgradedToVersion()
   348  	if from == version.Current {
   349  		logger.Infof("Upgrade to %v already completed.", version.Current)
   350  		return nil
   351  	}
   352  	context := upgrades.NewContext(agentConfig, apiState, st)
   353  	for _, job := range jobs {
   354  		var target upgrades.Target
   355  		switch job {
   356  		case params.JobManageEnviron:
   357  			target = upgrades.StateServer
   358  		case params.JobHostUnits:
   359  			target = upgrades.HostMachine
   360  		default:
   361  			continue
   362  		}
   363  		logger.Infof("Starting upgrade from %v to %v for %v", from, version.Current, target)
   364  		if err := upgrades.PerformUpgrade(from.Number, target, context); err != nil {
   365  			return fmt.Errorf("cannot perform upgrade from %v to %v for %v: %v", from, version.Current, target, err)
   366  		}
   367  	}
   368  	return a.Conf.config.WriteUpgradedToVersion(version.Current.Number)
   369  }
   371  func (a *MachineAgent) Entity(st *state.State) (AgentState, error) {
   372  	m, err := st.Machine(a.MachineId)
   373  	if err != nil {
   374  		return nil, err
   375  	}
   376  	// Check the machine nonce as provisioned matches the agent.Conf value.
   377  	if !m.CheckProvisioned(a.Conf.config.Nonce()) {
   378  		// The agent is running on a different machine to the one it
   379  		// should be according to state. It must stop immediately.
   380  		logger.Errorf("running machine %v agent on inappropriate instance", m)
   381  		return nil, worker.ErrTerminateAgent
   382  	}
   383  	return m, nil
   384  }
   386  func (a *MachineAgent) Tag() string {
   387  	return names.MachineTag(a.MachineId)
   388  }
   390  func (a *MachineAgent) uninstallAgent() error {
   391  	var errors []error
   392  	agentServiceName := a.Conf.config.Value(agent.AgentServiceName)
   393  	if agentServiceName == "" {
   394  		// For backwards compatibility, handle lack of AgentServiceName.
   395  		agentServiceName = os.Getenv("UPSTART_JOB")
   396  	}
   397  	if agentServiceName != "" {
   398  		if err := upstart.NewService(agentServiceName).Remove(); err != nil {
   399  			errors = append(errors, fmt.Errorf("cannot remove service %q: %v", agentServiceName, err))
   400  		}
   401  	}
   402  	// Remove the juju-run symlink.
   403  	if err := os.Remove(jujuRun); err != nil && !os.IsNotExist(err) {
   404  		errors = append(errors, err)
   405  	}
   406  	// The machine agent may terminate without knowing its jobs,
   407  	// for example if the machine's entry in state was removed.
   408  	// Thus, we do not rely on jobs here, and instead just check
   409  	// if the upstart config exists.
   410  	mongoServiceName := a.Conf.config.Value(agent.MongoServiceName)
   411  	if mongoServiceName != "" {
   412  		if err := upstart.NewService(mongoServiceName).StopAndRemove(); err != nil {
   413  			errors = append(errors, fmt.Errorf("cannot stop/remove service %q: %v", mongoServiceName, err))
   414  		}
   415  	}
   416  	if err := os.RemoveAll(a.Conf.dataDir); err != nil {
   417  		errors = append(errors, err)
   418  	}
   419  	if len(errors) == 0 {
   420  		return nil
   421  	}
   422  	return fmt.Errorf("uninstall failed: %v", errors)
   423  }
// The pieces below are used for testing, to give us access to the *State
// opened by the agent, and to allow us to trigger syncs without waiting 5s
// for them to happen automatically.
   429  var stateReporter chan<- *state.State
   431  func reportOpenedState(st *state.State) {
   432  	select {
   433  	case stateReporter <- st:
   434  	default:
   435  	}
   436  }
   438  func sendOpenedStates(dst chan<- *state.State) (undo func()) {
   439  	var original chan<- *state.State
   440  	original, stateReporter = stateReporter, dst
   441  	return func() { stateReporter = original }
   442  }
   444  var apiReporter chan<- *api.State
   446  func reportOpenedAPI(st *api.State) {
   447  	select {
   448  	case apiReporter <- st:
   449  	default:
   450  	}
   451  }
   452  func sendOpenedAPIs(dst chan<- *api.State) (undo func()) {
   453  	var original chan<- *api.State
   454  	original, apiReporter = apiReporter, dst
   455  	return func() { apiReporter = original }
   456  }