launchpad.net/~rogpeppe/juju-core/500-errgo-fix@v0.0.0-20140213181702-000000002356/cmd/jujud/machine.go (about)

     1  // Copyright 2012, 2013 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package main
     5  
     6  import (
     7  	"fmt"
     8  	"os"
     9  	"path/filepath"
    10  	"time"
    11  
    12  	"github.com/loggo/loggo"
    13  	"launchpad.net/errgo/errors"
    14  	"launchpad.net/gnuflag"
    15  	"launchpad.net/tomb"
    16  
    17  	"launchpad.net/juju-core/agent"
    18  	"launchpad.net/juju-core/charm"
    19  	"launchpad.net/juju-core/cmd"
    20  	"launchpad.net/juju-core/container/kvm"
    21  	"launchpad.net/juju-core/instance"
    22  	"launchpad.net/juju-core/log/syslog"
    23  	"launchpad.net/juju-core/names"
    24  	"launchpad.net/juju-core/provider"
    25  	"launchpad.net/juju-core/state"
    26  	"launchpad.net/juju-core/state/api"
    27  	apiagent "launchpad.net/juju-core/state/api/agent"
    28  	"launchpad.net/juju-core/state/api/params"
    29  	apiprovisioner "launchpad.net/juju-core/state/api/provisioner"
    30  	"launchpad.net/juju-core/state/apiserver"
    31  	"launchpad.net/juju-core/upstart"
    32  	"launchpad.net/juju-core/worker"
    33  	"launchpad.net/juju-core/worker/authenticationworker"
    34  	"launchpad.net/juju-core/worker/charmrevisionworker"
    35  	"launchpad.net/juju-core/worker/cleaner"
    36  	"launchpad.net/juju-core/worker/deployer"
    37  	"launchpad.net/juju-core/worker/firewaller"
    38  	"launchpad.net/juju-core/worker/instancepoller"
    39  	"launchpad.net/juju-core/worker/localstorage"
    40  	workerlogger "launchpad.net/juju-core/worker/logger"
    41  	"launchpad.net/juju-core/worker/machineenvironmentworker"
    42  	"launchpad.net/juju-core/worker/machiner"
    43  	"launchpad.net/juju-core/worker/minunitsworker"
    44  	"launchpad.net/juju-core/worker/provisioner"
    45  	"launchpad.net/juju-core/worker/resumer"
    46  	"launchpad.net/juju-core/worker/terminationworker"
    47  	"launchpad.net/juju-core/worker/upgrader"
    48  )
    49  
    50  var logger = loggo.GetLogger("juju.cmd.jujud")
    51  
    52  var mask = errors.Mask
    53  
    54  var newRunner = func(isFatal func(error) bool, moreImportant func(e0, e1 error) bool) worker.Runner {
    55  	return worker.NewRunner(isFatal, moreImportant)
    56  }
    57  
    58  const bootstrapMachineId = "0"
    59  
    60  var retryDelay = 3 * time.Second
    61  
    62  var jujuRun = "/usr/local/bin/juju-run"
    63  
// MachineAgent is a cmd.Command responsible for running a machine agent.
//
// Init must be called before Run or Stop, because it is the only place
// where the runner field is populated.
type MachineAgent struct {
	cmd.CommandBase
	tomb      tomb.Tomb     // marked done when Run returns; Wait and Stop block on it
	Conf      AgentConf     // agent configuration; flags added in SetFlags, read in Run
	MachineId string        // value of the --machine-id flag
	runner    worker.Runner // top-level runner created by Init; hosts the state, api and termination workers
}
    72  
    73  // Info returns usage information for the command.
    74  func (a *MachineAgent) Info() *cmd.Info {
    75  	return &cmd.Info{
    76  		Name:    "machine",
    77  		Purpose: "run a juju machine agent",
    78  	}
    79  }
    80  
    81  func (a *MachineAgent) SetFlags(f *gnuflag.FlagSet) {
    82  	a.Conf.addFlags(f)
    83  	f.StringVar(&a.MachineId, "machine-id", "", "id of the machine to run")
    84  }
    85  
    86  // Init initializes the command for running.
    87  func (a *MachineAgent) Init(args []string) error {
    88  	if !names.IsMachine(a.MachineId) {
    89  		return fmt.Errorf("--machine-id option must be set, and expects a non-negative integer")
    90  	}
    91  	if err := a.Conf.checkArgs(args); err != nil {
    92  		return err
    93  	}
    94  	a.runner = newRunner(isFatal, moreImportant)
    95  	return nil
    96  }
    97  
// Wait waits for the machine agent to finish.
//
// It blocks on the agent's tomb, which Run marks as done when it returns,
// and reports whatever error the tomb was killed with.
//
// NOTE(review): Run only kills the tomb on its final path; its early error
// returns (configuration read, initAgent) mark the tomb done without
// killing it, so Wait appears to report nil for those failures - confirm
// that this is intended.
func (a *MachineAgent) Wait() error {
	return a.tomb.Wait()
}
   102  
// Stop stops the machine agent.
//
// It kills the top-level runner, which causes the runner wait in Run to
// return, and then blocks until Run has marked the tomb as done.
//
// NOTE(review): a.runner is only assigned in Init, so calling Stop on an
// agent that was never initialised would dereference a nil runner.
func (a *MachineAgent) Stop() error {
	a.runner.Kill()
	return a.tomb.Wait()
}
   108  
   109  // Run runs a machine agent.
   110  func (a *MachineAgent) Run(_ *cmd.Context) error {
   111  	// Due to changes in the logging, and needing to care about old
   112  	// environments that have been upgraded, we need to explicitly remove the
   113  	// file writer if one has been added, otherwise we will get duplicate
   114  	// lines of all logging in the log file.
   115  	loggo.RemoveWriter("logfile")
   116  	defer a.tomb.Done()
   117  	logger.Infof("machine agent %v start", a.Tag())
   118  	if err := a.Conf.read(a.Tag()); err != nil {
   119  		return err
   120  	}
   121  	charm.CacheDir = filepath.Join(a.Conf.dataDir, "charmcache")
   122  	if err := a.initAgent(); err != nil {
   123  		return err
   124  	}
   125  
   126  	// ensureStateWorker ensures that there is a worker that
   127  	// connects to the state that runs within itself all the workers
   128  	// that need a state connection. Unless we're bootstrapping, we
   129  	// need to connect to the API server to find out if we need to
   130  	// call this, so we make the APIWorker call it when necessary if
   131  	// the machine requires it. Note that ensureStateWorker can be
   132  	// called many times - StartWorker does nothing if there is
   133  	// already a worker started with the given name.
   134  	ensureStateWorker := func() {
   135  		a.runner.StartWorker("state", a.StateWorker)
   136  	}
   137  	// We might be bootstrapping, and the API server is not
   138  	// running yet. If so, make sure we run a state worker instead.
   139  	if a.MachineId == bootstrapMachineId {
   140  		// TODO(rog) When we have HA, we only want to do this
   141  		// when we really are bootstrapping - once other
   142  		// instances of the API server have been started, we
   143  		// should follow the normal course of things and ignore
   144  		// the fact that this was once the bootstrap machine.
   145  		logger.Infof("Starting StateWorker for machine-0")
   146  		ensureStateWorker()
   147  	}
   148  	a.runner.StartWorker("api", func() (worker.Worker, error) {
   149  		return a.APIWorker(ensureStateWorker)
   150  	})
   151  	a.runner.StartWorker("termination", func() (worker.Worker, error) {
   152  		return terminationworker.NewWorker(), nil
   153  	})
   154  	err := a.runner.Wait()
   155  	if err == worker.ErrTerminateAgent {
   156  		err = a.uninstallAgent()
   157  	}
   158  	err = agentDone(err)
   159  	a.tomb.Kill(err)
   160  	return err
   161  }
   162  
   163  // APIWorker returns a Worker that connects to the API and starts any
   164  // workers that need an API connection.
   165  //
   166  // If a state worker is necessary, APIWorker calls ensureStateWorker.
   167  func (a *MachineAgent) APIWorker(ensureStateWorker func()) (worker.Worker, error) {
   168  	agentConfig := a.Conf.config
   169  	st, entity, err := openAPIState(agentConfig, a)
   170  	if err != nil {
   171  		return nil, err
   172  	}
   173  	reportOpenedAPI(st)
   174  	for _, job := range entity.Jobs() {
   175  		if job.NeedsState() {
   176  			ensureStateWorker()
   177  			break
   178  		}
   179  	}
   180  	runner := newRunner(connectionIsFatal(st), moreImportant)
   181  	runner.StartWorker("machiner", func() (worker.Worker, error) {
   182  		return machiner.NewMachiner(st.Machiner(), agentConfig), nil
   183  	})
   184  	runner.StartWorker("upgrader", func() (worker.Worker, error) {
   185  		return upgrader.NewUpgrader(st.Upgrader(), agentConfig), nil
   186  	})
   187  	runner.StartWorker("logger", func() (worker.Worker, error) {
   188  		return workerlogger.NewLogger(st.Logger(), agentConfig), nil
   189  	})
   190  	runner.StartWorker("machineenvironmentworker", func() (worker.Worker, error) {
   191  		return machineenvironmentworker.NewMachineEnvironmentWorker(st.Environment(), agentConfig), nil
   192  	})
   193  
   194  	// If not a local provider bootstrap machine, start the worker to manage SSH keys.
   195  	providerType := agentConfig.Value(agent.ProviderType)
   196  	if providerType != provider.Local || a.MachineId != bootstrapMachineId {
   197  		runner.StartWorker("authenticationworker", func() (worker.Worker, error) {
   198  			return authenticationworker.NewWorker(st.KeyUpdater(), agentConfig), nil
   199  		})
   200  	}
   201  
   202  	// Perform the operations needed to set up hosting for containers.
   203  	if err := a.setupContainerSupport(runner, st, entity); err != nil {
   204  		return nil, fmt.Errorf("setting up container support: %v", err)
   205  	}
   206  	for _, job := range entity.Jobs() {
   207  		switch job {
   208  		case params.JobHostUnits:
   209  			runner.StartWorker("deployer", func() (worker.Worker, error) {
   210  				apiDeployer := st.Deployer()
   211  				context := newDeployContext(apiDeployer, agentConfig)
   212  				return deployer.NewDeployer(apiDeployer, context), nil
   213  			})
   214  		case params.JobManageEnviron:
   215  			runner.StartWorker("environ-provisioner", func() (worker.Worker, error) {
   216  				return provisioner.NewEnvironProvisioner(st.Provisioner(), agentConfig), nil
   217  			})
   218  			// TODO(axw) 2013-09-24 bug #1229506
   219  			// Make another job to enable the firewaller. Not all environments
   220  			// are capable of managing ports centrally.
   221  			runner.StartWorker("firewaller", func() (worker.Worker, error) {
   222  				return firewaller.NewFirewaller(st.Firewaller())
   223  			})
   224  			runner.StartWorker("charm-revision-updater", func() (worker.Worker, error) {
   225  				return charmrevisionworker.NewRevisionUpdateWorker(st.CharmRevisionUpdater()), nil
   226  			})
   227  		case params.JobManageState:
   228  			// Legacy environments may set this, but we ignore it.
   229  		default:
   230  			// TODO(dimitern): Once all workers moved over to using
   231  			// the API, report "unknown job type" here.
   232  		}
   233  	}
   234  	return newCloseWorker(runner, st), nil // Note: a worker.Runner is itself a worker.Worker.
   235  }
   236  
   237  // setupContainerSupport determines what containers can be run on this machine and
   238  // initialises suitable infrastructure to support such containers.
   239  func (a *MachineAgent) setupContainerSupport(runner worker.Runner, st *api.State, entity *apiagent.Entity) error {
   240  	var supportedContainers []instance.ContainerType
   241  	// We don't yet support nested lxc containers but anything else can run an LXC container.
   242  	if entity.ContainerType() != instance.LXC {
   243  		supportedContainers = append(supportedContainers, instance.LXC)
   244  	}
   245  	supportsKvm, err := kvm.IsKVMSupported()
   246  	if err != nil {
   247  		logger.Warningf("determining kvm support: %v\nno kvm containers possible", err)
   248  	}
   249  	if err == nil && supportsKvm {
   250  		supportedContainers = append(supportedContainers, instance.KVM)
   251  	}
   252  	return a.updateSupportedContainers(runner, st, entity.Tag(), supportedContainers)
   253  }
   254  
   255  // updateSupportedContainers records in state that a machine can run the specified containers.
   256  // It starts a watcher and when a container of a given type is first added to the machine,
   257  // the watcher is killed, the machine is set up to be able to start containers of the given type,
   258  // and a suitable provisioner is started.
   259  func (a *MachineAgent) updateSupportedContainers(runner worker.Runner, st *api.State,
   260  	tag string, containers []instance.ContainerType) error {
   261  
   262  	var machine *apiprovisioner.Machine
   263  	var err error
   264  	pr := st.Provisioner()
   265  	if machine, err = pr.Machine(tag); err != nil {
   266  		return fmt.Errorf("%s is not in state: %v", tag, err)
   267  	}
   268  	if len(containers) == 0 {
   269  		if err := machine.SupportsNoContainers(); err != nil {
   270  			return fmt.Errorf("clearing supported containers for %s: %v", tag, err)
   271  		}
   272  		return nil
   273  	}
   274  	if err := machine.SetSupportedContainers(containers...); err != nil {
   275  		return fmt.Errorf("setting supported containers for %s: %v", tag, err)
   276  	}
   277  	// Start the watcher to fire when a container is first requested on the machine.
   278  	watcherName := fmt.Sprintf("%s-container-watcher", machine.Id())
   279  	handler := provisioner.NewContainerSetupHandler(runner, watcherName, containers, machine, pr, a.Conf.config)
   280  	runner.StartWorker(watcherName, func() (worker.Worker, error) {
   281  		return worker.NewStringsWorker(handler), nil
   282  	})
   283  	return nil
   284  }
   285  
   286  // StateJobs returns a worker running all the workers that require
   287  // a *state.State connection.
   288  func (a *MachineAgent) StateWorker() (worker.Worker, error) {
   289  	agentConfig := a.Conf.config
   290  	st, entity, err := openState(agentConfig, a)
   291  	if err != nil {
   292  		return nil, err
   293  	}
   294  	reportOpenedState(st)
   295  	m := entity.(*state.Machine)
   296  
   297  	runner := newRunner(connectionIsFatal(st), moreImportant)
   298  	// Take advantage of special knowledge here in that we will only ever want
   299  	// the storage provider on one machine, and that is the "bootstrap" node.
   300  	providerType := agentConfig.Value(agent.ProviderType)
   301  	if (providerType == provider.Local || provider.IsManual(providerType)) && m.Id() == bootstrapMachineId {
   302  		runner.StartWorker("local-storage", func() (worker.Worker, error) {
   303  			// TODO(axw) 2013-09-24 bug #1229507
   304  			// Make another job to enable storage.
   305  			// There's nothing special about this.
   306  			return localstorage.NewWorker(agentConfig), nil
   307  		})
   308  	}
   309  	for _, job := range m.Jobs() {
   310  		switch job {
   311  		case state.JobHostUnits:
   312  			// Implemented in APIWorker.
   313  		case state.JobManageEnviron:
   314  			runner.StartWorker("instancepoller", func() (worker.Worker, error) {
   315  				return instancepoller.NewWorker(st), nil
   316  			})
   317  			runner.StartWorker("apiserver", func() (worker.Worker, error) {
   318  				// If the configuration does not have the required information,
   319  				// it is currently not a recoverable error, so we kill the whole
   320  				// agent, potentially enabling human intervention to fix
   321  				// the agent's configuration file. In the future, we may retrieve
   322  				// the state server certificate and key from the state, and
   323  				// this should then change.
   324  				port, cert, key := a.Conf.config.APIServerDetails()
   325  				if len(cert) == 0 || len(key) == 0 {
   326  					return nil, &fatalError{"configuration does not have state server cert/key"}
   327  				}
   328  				dataDir := a.Conf.config.DataDir()
   329  				return apiserver.NewServer(st, fmt.Sprintf(":%d", port), cert, key, dataDir)
   330  			})
   331  			runner.StartWorker("cleaner", func() (worker.Worker, error) {
   332  				return cleaner.NewCleaner(st), nil
   333  			})
   334  			runner.StartWorker("resumer", func() (worker.Worker, error) {
   335  				// The action of resumer is so subtle that it is not tested,
   336  				// because we can't figure out how to do so without brutalising
   337  				// the transaction log.
   338  				return resumer.NewResumer(st), nil
   339  			})
   340  			runner.StartWorker("minunitsworker", func() (worker.Worker, error) {
   341  				return minunitsworker.NewMinUnitsWorker(st), nil
   342  			})
   343  		case state.JobManageState:
   344  			// Legacy environments may set this, but we ignore it.
   345  		default:
   346  			logger.Warningf("ignoring unknown job %q", job)
   347  		}
   348  	}
   349  	return newCloseWorker(runner, st), nil
   350  }
   351  
   352  func (a *MachineAgent) Entity(st *state.State) (AgentState, error) {
   353  	m, err := st.Machine(a.MachineId)
   354  	if err != nil {
   355  		return nil, err
   356  	}
   357  	// Check the machine nonce as provisioned matches the agent.Conf value.
   358  	if !m.CheckProvisioned(a.Conf.config.Nonce()) {
   359  		// The agent is running on a different machine to the one it
   360  		// should be according to state. It must stop immediately.
   361  		logger.Errorf("running machine %v agent on inappropriate instance", m)
   362  		return nil, worker.ErrTerminateAgent
   363  	}
   364  	return m, nil
   365  }
   366  
// Tag returns the machine tag that names.MachineTag derives from
// a.MachineId. Run uses it to identify the agent when logging and when
// reading the agent configuration.
func (a *MachineAgent) Tag() string {
	return names.MachineTag(a.MachineId)
}
   370  
   371  func (a *MachineAgent) initAgent() error {
   372  	if err := os.Remove(jujuRun); err != nil && !os.IsNotExist(err) {
   373  		return err
   374  	}
   375  	jujud := filepath.Join(a.Conf.dataDir, "tools", a.Tag(), "jujud")
   376  	return os.Symlink(jujud, jujuRun)
   377  }
   378  
   379  func (a *MachineAgent) uninstallAgent() error {
   380  	var errors []error
   381  	agentServiceName := a.Conf.config.Value(agent.AgentServiceName)
   382  	if agentServiceName == "" {
   383  		// For backwards compatibility, handle lack of AgentServiceName.
   384  		agentServiceName = os.Getenv("UPSTART_JOB")
   385  	}
   386  	if agentServiceName != "" {
   387  		if err := upstart.NewService(agentServiceName).Remove(); err != nil {
   388  			errors = append(errors, fmt.Errorf("cannot remove service %q: %v", agentServiceName, err))
   389  		}
   390  	}
   391  	// Remove the rsyslog conf file and restart rsyslogd.
   392  	if rsyslogConfPath := a.Conf.config.Value(agent.RsyslogConfPath); rsyslogConfPath != "" {
   393  		if err := os.Remove(rsyslogConfPath); err != nil {
   394  			errors = append(errors, err)
   395  		}
   396  		if err := syslog.Restart(); err != nil {
   397  			errors = append(errors, err)
   398  		}
   399  	}
   400  	// Remove the juju-run symlink.
   401  	if err := os.Remove(jujuRun); err != nil && !os.IsNotExist(err) {
   402  		errors = append(errors, err)
   403  	}
   404  	// The machine agent may terminate without knowing its jobs,
   405  	// for example if the machine's entry in state was removed.
   406  	// Thus, we do not rely on jobs here, and instead just check
   407  	// if the upstart config exists.
   408  	mongoServiceName := a.Conf.config.Value(agent.MongoServiceName)
   409  	if mongoServiceName != "" {
   410  		if err := upstart.NewService(mongoServiceName).StopAndRemove(); err != nil {
   411  			errors = append(errors, fmt.Errorf("cannot stop/remove service %q: %v", mongoServiceName, err))
   412  		}
   413  	}
   414  	if err := os.RemoveAll(a.Conf.dataDir); err != nil {
   415  		errors = append(errors, err)
   416  	}
   417  	if len(errors) == 0 {
   418  		return nil
   419  	}
   420  	return fmt.Errorf("uninstall failed: %v", errors)
   421  }
   422  
   423  // Below pieces are used for testing, to give us access to the *State opened
   424  // by the agent, and allow us to trigger syncs without waiting 5s for them
   425  // to happen automatically.
   426  
// stateReporter is the channel on which opened *state.State values are
// offered. It is nil until sendOpenedStates installs a destination.
var stateReporter chan<- *state.State

// reportOpenedState offers st on stateReporter without blocking. If no
// receiver is ready, or stateReporter is nil (a send on a nil channel
// can never proceed), the default case is taken and st is not reported.
func reportOpenedState(st *state.State) {
	select {
	case stateReporter <- st:
	default:
	}
}
   435  
   436  func sendOpenedStates(dst chan<- *state.State) (undo func()) {
   437  	var original chan<- *state.State
   438  	original, stateReporter = stateReporter, dst
   439  	return func() { stateReporter = original }
   440  }
   441  
// apiReporter is the channel on which opened *api.State values are
// offered. It is nil until sendOpenedAPIs installs a destination.
var apiReporter chan<- *api.State

// reportOpenedAPI offers st on apiReporter without blocking. If no
// receiver is ready, or apiReporter is nil (a send on a nil channel can
// never proceed), the default case is taken and st is not reported.
func reportOpenedAPI(st *api.State) {
	select {
	case apiReporter <- st:
	default:
	}
}
   450  func sendOpenedAPIs(dst chan<- *api.State) (undo func()) {
   451  	var original chan<- *api.State
   452  	original, apiReporter = apiReporter, dst
   453  	return func() { apiReporter = original }
   454  }