github.com/mattyw/juju@v0.0.0-20140610034352-732aecd63861/cmd/jujud/machine.go

// Copyright 2012, 2013 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package main

import (
	"fmt"
	"net"
	"os"
	"path/filepath"
	"runtime"
	"time"

	"github.com/juju/errors"
	"github.com/juju/loggo"
	"github.com/juju/names"
	"github.com/juju/utils"
	"github.com/juju/utils/voyeur"
	"labix.org/v2/mgo"
	"launchpad.net/gnuflag"
	"launchpad.net/tomb"

	"github.com/juju/juju/agent"
	"github.com/juju/juju/agent/mongo"
	"github.com/juju/juju/charm"
	"github.com/juju/juju/cmd"
	"github.com/juju/juju/container/kvm"
	"github.com/juju/juju/environs"
	"github.com/juju/juju/instance"
	"github.com/juju/juju/provider"
	"github.com/juju/juju/state"
	"github.com/juju/juju/state/api"
	apiagent "github.com/juju/juju/state/api/agent"
	"github.com/juju/juju/state/api/params"
	"github.com/juju/juju/state/apiserver"
	"github.com/juju/juju/upgrades"
	"github.com/juju/juju/upstart"
	"github.com/juju/juju/version"
	"github.com/juju/juju/worker"
	"github.com/juju/juju/worker/apiaddressupdater"
	"github.com/juju/juju/worker/authenticationworker"
	"github.com/juju/juju/worker/charmrevisionworker"
	"github.com/juju/juju/worker/cleaner"
	"github.com/juju/juju/worker/deployer"
	"github.com/juju/juju/worker/firewaller"
	"github.com/juju/juju/worker/instancepoller"
	"github.com/juju/juju/worker/localstorage"
	workerlogger "github.com/juju/juju/worker/logger"
	"github.com/juju/juju/worker/machineenvironmentworker"
	"github.com/juju/juju/worker/machiner"
	"github.com/juju/juju/worker/minunitsworker"
	"github.com/juju/juju/worker/peergrouper"
	"github.com/juju/juju/worker/provisioner"
	"github.com/juju/juju/worker/resumer"
	"github.com/juju/juju/worker/rsyslog"
	"github.com/juju/juju/worker/singular"
	"github.com/juju/juju/worker/terminationworker"
	"github.com/juju/juju/worker/upgrader"
)

var logger = loggo.GetLogger("juju.cmd.jujud")

var newRunner = worker.NewRunner

const bootstrapMachineId = "0"

// eitherState can be either a *state.State or an *api.State.
type eitherState interface{}

var (
	retryDelay      = 3 * time.Second
	jujuRun         = "/usr/local/bin/juju-run"
	useMultipleCPUs = utils.UseMultipleCPUs

	// The following are defined as variables to
	// allow the tests to intercept calls to the functions.
	ensureMongoServer        = mongo.EnsureServer
	maybeInitiateMongoServer = peergrouper.MaybeInitiateMongoServer
	ensureMongoAdminUser     = mongo.EnsureAdminUser
	newSingularRunner        = singular.New
	peergrouperNew           = peergrouper.New

	// reportOpenedState is exposed for tests to know when
	// the State has been successfully opened.
	reportOpenedState = func(eitherState) {}

	// reportOpenedAPI is exposed for tests to know when
	// the API has been successfully opened.
	reportOpenedAPI = func(eitherState) {}
)

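// The function variables above form this package's test seam: a test
// assigns a stub, exercises the agent, and restores the original afterwards.
// patchReportOpenedState below is a minimal sketch of such a helper
// (hypothetical; nothing in the agent itself calls it):
func patchReportOpenedState(f func(eitherState)) (restore func()) {
	orig := reportOpenedState
	reportOpenedState = f
	return func() { reportOpenedState = orig }
}
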
// MachineAgent is a cmd.Command responsible for running a machine agent.
type MachineAgent struct {
	cmd.CommandBase
	tomb tomb.Tomb
	AgentConf
	MachineId        string
	runner           worker.Runner
	configChangedVal voyeur.Value
	upgradeComplete  chan struct{}
	workersStarted   chan struct{}
	st               *state.State
}

// Info returns usage information for the command.
func (a *MachineAgent) Info() *cmd.Info {
	return &cmd.Info{
		Name:    "machine",
		Purpose: "run a juju machine agent",
	}
}

func (a *MachineAgent) SetFlags(f *gnuflag.FlagSet) {
	a.AgentConf.AddFlags(f)
	f.StringVar(&a.MachineId, "machine-id", "", "id of the machine to run")
}

// Init initializes the command for running.
func (a *MachineAgent) Init(args []string) error {
	if !names.IsMachine(a.MachineId) {
		return fmt.Errorf("--machine-id option must be set, and expects a non-negative integer")
	}
	if err := a.AgentConf.CheckArgs(args); err != nil {
		return err
	}
	a.runner = newRunner(isFatal, moreImportant)
	a.upgradeComplete = make(chan struct{})
	a.workersStarted = make(chan struct{})
	return nil
}

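// A typical invocation looks like the following; --machine-id is defined
// above, while --data-dir comes from the embedded AgentConf (the path shown
// is illustrative):
//
//	jujud machine --machine-id 0 --data-dir /var/lib/juju
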
// Wait waits for the machine agent to finish.
func (a *MachineAgent) Wait() error {
	return a.tomb.Wait()
}

// Stop stops the machine agent.
func (a *MachineAgent) Stop() error {
	a.runner.Kill()
	return a.tomb.Wait()
}

// Run runs a machine agent.
func (a *MachineAgent) Run(_ *cmd.Context) error {
	// Due to changes in the logging, and because we must support old
	// environments that have been upgraded, we explicitly remove the file
	// writer if one has been added; otherwise all logging would appear
	// twice in the log file.
	loggo.RemoveWriter("logfile")
	defer a.tomb.Done()
	logger.Infof("machine agent %v start (%s [%s])", a.Tag(), version.Current, runtime.Compiler)
	if err := a.ReadConfig(a.Tag()); err != nil {
		return fmt.Errorf("cannot read agent configuration: %v", err)
	}
	a.configChangedVal.Set(struct{}{})
	agentConfig := a.CurrentConfig()
	charm.CacheDir = filepath.Join(agentConfig.DataDir(), "charmcache")
	if err := a.createJujuRun(agentConfig.DataDir()); err != nil {
		return fmt.Errorf("cannot create juju run symlink: %v", err)
	}
	a.runner.StartWorker("api", a.APIWorker)
	a.runner.StartWorker("statestarter", a.newStateStarterWorker)
	a.runner.StartWorker("termination", func() (worker.Worker, error) {
		return terminationworker.NewWorker(), nil
	})
	// At this point, all workers will have been configured to start.
	close(a.workersStarted)
	err := a.runner.Wait()
	if err == worker.ErrTerminateAgent {
		err = a.uninstallAgent(agentConfig)
	}
	err = agentDone(err)
	a.tomb.Kill(err)
	return err
}

func (a *MachineAgent) ChangeConfig(mutate func(config agent.ConfigSetter)) error {
	err := a.AgentConf.ChangeConfig(mutate)
	a.configChangedVal.Set(struct{}{})
	return err
}

// newStateStarterWorker wraps stateStarter in a simple worker for use in
// a.runner.StartWorker.
func (a *MachineAgent) newStateStarterWorker() (worker.Worker, error) {
	return worker.NewSimpleWorker(a.stateStarter), nil
}

// stateStarter watches for changes to the agent configuration, and
// starts or stops the state worker as appropriate. We watch the agent
// configuration because it has all the details we need to start a state
// server, whether they have been cached or read from the state.
//
// It will stop working as soon as stopch is closed.
func (a *MachineAgent) stateStarter(stopch <-chan struct{}) error {
	confWatch := a.configChangedVal.Watch()
	defer confWatch.Close()
	watchCh := make(chan struct{})
	go func() {
		for confWatch.Next() {
			watchCh <- struct{}{}
		}
	}()
	for {
		select {
		case <-watchCh:
			agentConfig := a.CurrentConfig()

			// N.B. StartWorker and StopWorker are idempotent.
			_, ok := agentConfig.StateServingInfo()
			if ok {
				a.runner.StartWorker("state", func() (worker.Worker, error) {
					return a.StateWorker()
				})
			} else {
				a.runner.StopWorker("state")
			}
		case <-stopch:
			return nil
		}
	}
}

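// stateStarter relies on the voyeur.Value contract: a zero Value is ready to
// use, Watch returns a watcher whose Next blocks until Set is called again,
// and Close unblocks any pending Next. A freestanding sketch of that
// contract (illustrative only; assumes the github.com/juju/utils/voyeur API
// behaves as described):
func exampleVoyeurContract() {
	var v voyeur.Value
	w := v.Watch()
	go v.Set(struct{}{})
	if w.Next() { // blocks until the Set above is observed
		_ = w.Value()
	}
	w.Close()
}
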
// APIWorker returns a Worker that connects to the API and starts any
// workers that need an API connection.
func (a *MachineAgent) APIWorker() (worker.Worker, error) {
	agentConfig := a.CurrentConfig()
	st, entity, err := openAPIState(agentConfig, a)
	if err != nil {
		return nil, err
	}
	reportOpenedAPI(st)

	// Refresh the configuration, since it may have been updated after opening state.
	agentConfig = a.CurrentConfig()

	for _, job := range entity.Jobs() {
		if job.NeedsState() {
			info, err := st.Agent().StateServingInfo()
			if err != nil {
				return nil, fmt.Errorf("cannot get state serving info: %v", err)
			}
			err = a.ChangeConfig(func(config agent.ConfigSetter) {
				config.SetStateServingInfo(info)
			})
			if err != nil {
				return nil, err
			}
			agentConfig = a.CurrentConfig()
			break
		}
	}

	rsyslogMode := rsyslog.RsyslogModeForwarding
	runner := newRunner(connectionIsFatal(st), moreImportant)
	var singularRunner worker.Runner
	for _, job := range entity.Jobs() {
		if job == params.JobManageEnviron {
			rsyslogMode = rsyslog.RsyslogModeAccumulate
			conn := singularAPIConn{st, st.Agent()}
			singularRunner, err = newSingularRunner(runner, conn)
			if err != nil {
				return nil, fmt.Errorf("cannot make singular API Runner: %v", err)
			}
			break
		}
	}

	// Run the upgrader and the upgrade-steps worker without waiting for
	// the upgrade steps to complete.
	runner.StartWorker("upgrader", func() (worker.Worker, error) {
		return upgrader.NewUpgrader(st.Upgrader(), agentConfig), nil
	})
	runner.StartWorker("upgrade-steps", func() (worker.Worker, error) {
		return a.upgradeWorker(st, entity.Jobs(), agentConfig), nil
	})

	// All other workers must wait for the upgrade steps to complete
	// before starting.
	a.startWorkerAfterUpgrade(runner, "machiner", func() (worker.Worker, error) {
		return machiner.NewMachiner(st.Machiner(), agentConfig), nil
	})
	a.startWorkerAfterUpgrade(runner, "apiaddressupdater", func() (worker.Worker, error) {
		return apiaddressupdater.NewAPIAddressUpdater(st.Machiner(), a), nil
	})
	a.startWorkerAfterUpgrade(runner, "logger", func() (worker.Worker, error) {
		return workerlogger.NewLogger(st.Logger(), agentConfig), nil
	})
	a.startWorkerAfterUpgrade(runner, "machineenvironmentworker", func() (worker.Worker, error) {
		return machineenvironmentworker.NewMachineEnvironmentWorker(st.Environment(), agentConfig), nil
	})
	a.startWorkerAfterUpgrade(runner, "rsyslog", func() (worker.Worker, error) {
		return newRsyslogConfigWorker(st.Rsyslog(), agentConfig, rsyslogMode)
	})

	// Unless this is the bootstrap machine of a local provider environment,
	// start the worker that manages SSH keys.
	providerType := agentConfig.Value(agent.ProviderType)
	if providerType != provider.Local || a.MachineId != bootstrapMachineId {
		a.startWorkerAfterUpgrade(runner, "authenticationworker", func() (worker.Worker, error) {
			return authenticationworker.NewWorker(st.KeyUpdater(), agentConfig), nil
		})
	}

	// Perform the operations needed to set up hosting for containers.
	if err := a.setupContainerSupport(runner, st, entity, agentConfig); err != nil {
		return nil, fmt.Errorf("setting up container support: %v", err)
	}
	for _, job := range entity.Jobs() {
		switch job {
		case params.JobHostUnits:
			a.startWorkerAfterUpgrade(runner, "deployer", func() (worker.Worker, error) {
				apiDeployer := st.Deployer()
				context := newDeployContext(apiDeployer, agentConfig)
				return deployer.NewDeployer(apiDeployer, context), nil
			})
		case params.JobManageEnviron:
			a.startWorkerAfterUpgrade(singularRunner, "environ-provisioner", func() (worker.Worker, error) {
				return provisioner.NewEnvironProvisioner(st.Provisioner(), agentConfig), nil
			})
			// TODO(axw) 2013-09-24 bug #1229506
			// Make another job to enable the firewaller. Not all
			// environments are capable of managing ports
			// centrally.
			a.startWorkerAfterUpgrade(singularRunner, "firewaller", func() (worker.Worker, error) {
				return firewaller.NewFirewaller(st.Firewaller())
			})
			a.startWorkerAfterUpgrade(singularRunner, "charm-revision-updater", func() (worker.Worker, error) {
				return charmrevisionworker.NewRevisionUpdateWorker(st.CharmRevisionUpdater()), nil
			})
		case params.JobManageStateDeprecated:
			// Legacy environments may set this, but we ignore it.
		default:
			// TODO(dimitern): Once all workers have moved over to using
			// the API, report "unknown job type" here.
		}
	}
	return newCloseWorker(runner, st), nil // Note: a worker.Runner is itself a worker.Worker.
}

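// Two runners are in play in APIWorker above: runner hosts workers that must
// run on every machine, while singularRunner (built only for state servers)
// ensures environment-wide workers such as the provisioner and firewaller
// run on at most one state server at a time. The wiring, as in the code
// above, is simply:
//
//	conn := singularAPIConn{st, st.Agent()}
//	singularRunner, err := newSingularRunner(runner, conn)
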
// setupContainerSupport determines what containers can be run on this machine and
// initialises suitable infrastructure to support such containers.
func (a *MachineAgent) setupContainerSupport(runner worker.Runner, st *api.State, entity *apiagent.Entity, agentConfig agent.Config) error {
	var supportedContainers []instance.ContainerType
	// We don't yet support nested LXC containers, but anything else can run an LXC container.
	if entity.ContainerType() != instance.LXC {
		supportedContainers = append(supportedContainers, instance.LXC)
	}
	supportsKvm, err := kvm.IsKVMSupported()
	if err != nil {
		logger.Warningf("determining kvm support: %v\nno kvm containers possible", err)
	}
	if err == nil && supportsKvm {
		supportedContainers = append(supportedContainers, instance.KVM)
	}
	return a.updateSupportedContainers(runner, st, entity.Tag(), supportedContainers, agentConfig)
}

// updateSupportedContainers records in state that a machine can run the specified containers.
// It starts a watcher and when a container of a given type is first added to the machine,
// the watcher is killed, the machine is set up to be able to start containers of the given type,
// and a suitable provisioner is started.
func (a *MachineAgent) updateSupportedContainers(
	runner worker.Runner,
	st *api.State,
	tag string,
	containers []instance.ContainerType,
	agentConfig agent.Config,
) error {
	pr := st.Provisioner()
	machine, err := pr.Machine(tag)
	if err != nil {
		return fmt.Errorf("%s is not in state: %v", tag, err)
	}
	if len(containers) == 0 {
		if err := machine.SupportsNoContainers(); err != nil {
			return fmt.Errorf("clearing supported containers for %s: %v", tag, err)
		}
		return nil
	}
	if err := machine.SetSupportedContainers(containers...); err != nil {
		return fmt.Errorf("setting supported containers for %s: %v", tag, err)
	}
	initLock, err := hookExecutionLock(agentConfig.DataDir())
	if err != nil {
		return err
	}
	// Start the watcher to fire when a container is first requested on the machine.
	watcherName := fmt.Sprintf("%s-container-watcher", machine.Id())
	handler := provisioner.NewContainerSetupHandler(
		runner,
		watcherName,
		containers,
		machine,
		pr,
		agentConfig,
		initLock,
	)
	a.startWorkerAfterUpgrade(runner, watcherName, func() (worker.Worker, error) {
		return worker.NewStringsWorker(handler), nil
	})
	return nil
}

// StateWorker returns a worker running all the workers that require
// a *state.State connection.
func (a *MachineAgent) StateWorker() (worker.Worker, error) {
	agentConfig := a.CurrentConfig()

	// Create the system-identity file.
	if err := agent.WriteSystemIdentityFile(agentConfig); err != nil {
		return nil, err
	}

	// Start the MongoDB server.
	if err := a.ensureMongoServer(agentConfig); err != nil {
		return nil, err
	}
	st, m, err := openState(agentConfig)
	if err != nil {
		return nil, err
	}
	reportOpenedState(st)

	singularStateConn := singularStateConn{st.MongoSession(), m}
	runner := newRunner(connectionIsFatal(st), moreImportant)
	singularRunner, err := newSingularRunner(runner, singularStateConn)
	if err != nil {
		return nil, fmt.Errorf("cannot make singular State Runner: %v", err)
	}

	// Take advantage of special knowledge here: we will only ever want
	// the storage provider on one machine, and that is the "bootstrap" node.
	providerType := agentConfig.Value(agent.ProviderType)
	if (providerType == provider.Local || provider.IsManual(providerType)) && m.Id() == bootstrapMachineId {
		a.startWorkerAfterUpgrade(runner, "local-storage", func() (worker.Worker, error) {
			// TODO(axw) 2013-09-24 bug #1229507
			// Make another job to enable storage.
			// There's nothing special about this.
			return localstorage.NewWorker(agentConfig), nil
		})
	}
	for _, job := range m.Jobs() {
		switch job {
		case state.JobHostUnits:
			// Implemented in APIWorker.
		case state.JobManageEnviron:
			useMultipleCPUs()
			a.startWorkerAfterUpgrade(runner, "instancepoller", func() (worker.Worker, error) {
				return instancepoller.NewWorker(st), nil
			})
			if shouldEnableHA(agentConfig) {
				a.startWorkerAfterUpgrade(runner, "peergrouper", func() (worker.Worker, error) {
					return peergrouperNew(st)
				})
			}
			runner.StartWorker("apiserver", func() (worker.Worker, error) {
				// If the configuration does not have the required information,
				// it is currently not a recoverable error, so we kill the whole
				// agent, potentially enabling human intervention to fix
				// the agent's configuration file. In the future, we may retrieve
				// the state server certificate and key from the state, and
				// this should then change.
				info, ok := agentConfig.StateServingInfo()
				if !ok {
					return nil, &fatalError{"StateServingInfo not available and we need it"}
				}
				port := info.APIPort
				cert := []byte(info.Cert)
				key := []byte(info.PrivateKey)

				if len(cert) == 0 || len(key) == 0 {
					return nil, &fatalError{"configuration does not have state server cert/key"}
				}
				dataDir := agentConfig.DataDir()
				logDir := agentConfig.LogDir()
				return apiserver.NewServer(
					st, fmt.Sprintf(":%d", port), cert, key, dataDir, logDir)
			})
			a.startWorkerAfterUpgrade(singularRunner, "cleaner", func() (worker.Worker, error) {
				return cleaner.NewCleaner(st), nil
			})
			a.startWorkerAfterUpgrade(singularRunner, "resumer", func() (worker.Worker, error) {
				// The action of resumer is so subtle that it is not tested,
				// because we can't figure out how to do so without brutalising
				// the transaction log.
				return resumer.NewResumer(st), nil
			})
			a.startWorkerAfterUpgrade(singularRunner, "minunitsworker", func() (worker.Worker, error) {
				return minunitsworker.NewMinUnitsWorker(st), nil
			})
		case state.JobManageStateDeprecated:
			// Legacy environments may set this, but we ignore it.
		default:
			logger.Warningf("ignoring unknown job %q", job)
		}
	}
	return newCloseWorker(runner, st), nil
}

// ensureMongoServer ensures that mongo is installed and running,
// and ready for opening a state connection.
func (a *MachineAgent) ensureMongoServer(agentConfig agent.Config) error {
	servingInfo, ok := agentConfig.StateServingInfo()
	if !ok {
		return fmt.Errorf("state worker was started with no state serving info")
	}
	namespace := agentConfig.Value(agent.Namespace)
	withHA := shouldEnableHA(agentConfig)

	// When upgrading from a pre-HA-capable environment,
	// we must add machine-0 to the admin database and
	// initiate its replicaset.
	//
	// TODO(axw) remove this when we no longer need
	// to upgrade from pre-HA-capable environments.
	var shouldInitiateMongoServer bool
	var addrs []instance.Address
	if isPreHAVersion(agentConfig.UpgradedToVersion()) {
		_, err := a.ensureMongoAdminUser(agentConfig)
		if err != nil {
			return err
		}
		if servingInfo.SharedSecret == "" {
			servingInfo.SharedSecret, err = mongo.GenerateSharedSecret()
			if err != nil {
				return err
			}
			if err = a.ChangeConfig(func(config agent.ConfigSetter) {
				config.SetStateServingInfo(servingInfo)
			}); err != nil {
				return err
			}
			agentConfig = a.CurrentConfig()
		}
		st, m, err := openState(agentConfig)
		if err != nil {
			return err
		}
		if err := st.SetStateServingInfo(servingInfo); err != nil {
			st.Close()
			return fmt.Errorf("cannot set state serving info: %v", err)
		}
		st.Close()
		addrs = m.Addresses()
		shouldInitiateMongoServer = withHA
	}

	// ensureMongoServer installs/upgrades the upstart config as necessary.
	if err := ensureMongoServer(
		agentConfig.DataDir(),
		namespace,
		servingInfo,
		withHA,
	); err != nil {
		return err
	}
	if !shouldInitiateMongoServer {
		return nil
	}

	// Initiate the replicaset for upgraded environments.
	//
	// TODO(axw) remove this when we no longer need
	// to upgrade from pre-HA-capable environments.
	stateInfo, ok := agentConfig.StateInfo()
	if !ok {
		return fmt.Errorf("state worker was started with no state info")
	}
	dialInfo, err := state.DialInfo(stateInfo, state.DefaultDialOpts())
	if err != nil {
		return err
	}
	peerAddr := mongo.SelectPeerAddress(addrs)
	if peerAddr == "" {
		return fmt.Errorf("no appropriate peer address found in %q", addrs)
	}
	return maybeInitiateMongoServer(peergrouper.InitiateMongoParams{
		DialInfo:       dialInfo,
		MemberHostPort: net.JoinHostPort(peerAddr, fmt.Sprint(servingInfo.StatePort)),
		User:           stateInfo.Tag,
		Password:       stateInfo.Password,
	})
}

func (a *MachineAgent) ensureMongoAdminUser(agentConfig agent.Config) (added bool, err error) {
	stateInfo, ok1 := agentConfig.StateInfo()
	servingInfo, ok2 := agentConfig.StateServingInfo()
	if !ok1 || !ok2 {
		return false, fmt.Errorf("no state serving info configuration")
	}
	dialInfo, err := state.DialInfo(stateInfo, state.DefaultDialOpts())
	if err != nil {
		return false, err
	}
	if len(dialInfo.Addrs) > 1 {
		logger.Infof("more than one state server; admin user must exist")
		return false, nil
	}
	return ensureMongoAdminUser(mongo.EnsureAdminUserParams{
		DialInfo:  dialInfo,
		Namespace: agentConfig.Value(agent.Namespace),
		DataDir:   agentConfig.DataDir(),
		Port:      servingInfo.StatePort,
		User:      stateInfo.Tag,
		Password:  stateInfo.Password,
	})
}

func isPreHAVersion(v version.Number) bool {
	return v.Compare(version.MustParse("1.19.0")) < 0
}
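
// For example (illustrative version numbers):
//
//	isPreHAVersion(version.MustParse("1.18.4")) // true: pre-HA upgrade path applies
//	isPreHAVersion(version.MustParse("1.19.0")) // false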

// shouldEnableHA reports whether HA should be enabled.
//
// Eventually this should always be true, and ideally
// it should be true before 1.20 is released or we'll
// have more upgrade scenarios on our hands.
func shouldEnableHA(agentConfig agent.Config) bool {
	providerType := agentConfig.Value(agent.ProviderType)
	return providerType != provider.Local
}

func openState(agentConfig agent.Config) (_ *state.State, _ *state.Machine, err error) {
	info, ok := agentConfig.StateInfo()
	if !ok {
		return nil, nil, fmt.Errorf("no state info available")
	}
	st, err := state.Open(info, state.DialOpts{}, environs.NewStatePolicy())
	if err != nil {
		return nil, nil, err
	}
	defer func() {
		if err != nil {
			st.Close()
		}
	}()
	m0, err := st.FindEntity(agentConfig.Tag())
	if err != nil {
		if errors.IsNotFound(err) {
			err = worker.ErrTerminateAgent
		}
		return nil, nil, err
	}
	m := m0.(*state.Machine)
	if m.Life() == state.Dead {
		return nil, nil, worker.ErrTerminateAgent
	}
	// Check that the machine nonce as provisioned matches the agent.Conf value.
	if !m.CheckProvisioned(agentConfig.Nonce()) {
		// The agent is running on a different machine to the one it
		// should be according to state. It must stop immediately.
		logger.Errorf("running machine %v agent on inappropriate instance", m)
		return nil, nil, worker.ErrTerminateAgent
	}
	return st, m, nil
}

// startWorkerAfterUpgrade starts a worker to run the specified child worker
// but only after waiting for upgrades to complete.
func (a *MachineAgent) startWorkerAfterUpgrade(runner worker.Runner, name string, start func() (worker.Worker, error)) {
	runner.StartWorker(name, func() (worker.Worker, error) {
		return a.upgradeWaiterWorker(start), nil
	})
}

// upgradeWaiterWorker runs the specified worker after upgrades have completed.
func (a *MachineAgent) upgradeWaiterWorker(start func() (worker.Worker, error)) worker.Worker {
	return worker.NewSimpleWorker(func(stop <-chan struct{}) error {
		// Wait for the upgrade to complete (or for us to be stopped).
		select {
		case <-stop:
			return nil
		case <-a.upgradeComplete:
		}
		w, err := start()
		if err != nil {
			return err
		}
		waitCh := make(chan error)
		go func() {
			waitCh <- w.Wait()
		}()
		select {
		case err := <-waitCh:
			return err
		case <-stop:
			w.Kill()
		}
		return <-waitCh
	})
}

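// The upgrade gate used by upgradeWaiterWorker is just a channel closed
// exactly once: every waiter blocks in its select until upgradeWorker closes
// a.upgradeComplete, at which point all of them unblock together. A
// freestanding sketch of the idiom:
func exampleUpgradeGate() {
	gate := make(chan struct{})
	go func() {
		// ... perform the one-time work, then open the gate for everyone.
		close(gate)
	}()
	<-gate // blocks until the gate is closed
}
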
// upgradeWorker runs the required upgrade operations to upgrade to the current Juju version.
func (a *MachineAgent) upgradeWorker(
	apiState *api.State,
	jobs []params.MachineJob,
	agentConfig agent.Config,
) worker.Worker {
	return worker.NewSimpleWorker(func(stop <-chan struct{}) error {
		select {
		case <-a.upgradeComplete:
			// Our work is already done (we're probably being restarted
			// because the API connection has gone down), so do nothing.
			<-stop
			return nil
		default:
		}
		// If the machine agent is a state server, flag that state
		// needs to be opened before running upgrade steps.
		needsState := false
		for _, job := range jobs {
			if job == params.JobManageEnviron {
				needsState = true
			}
		}
		// We need a *state.State for upgrades. We open it independently
		// of StateWorker, because we have no guarantees about when
		// and how often StateWorker might run.
		var st *state.State
		if needsState {
			var err error
			info, ok := agentConfig.StateInfo()
			if !ok {
				return fmt.Errorf("no state info available")
			}
			st, err = state.Open(info, state.DialOpts{}, environs.NewStatePolicy())
			if err != nil {
				return err
			}
			defer st.Close()
		}
		err := a.runUpgrades(st, apiState, jobs, agentConfig)
		if err != nil {
			return err
		}
		logger.Infof("upgrade to %v completed.", version.Current)
		close(a.upgradeComplete)
		<-stop
		return nil
	})
}

// runUpgrades runs the upgrade operations for each job type and updates the
// agent's UpgradedToVersion on success.
func (a *MachineAgent) runUpgrades(
	st *state.State,
	apiState *api.State,
	jobs []params.MachineJob,
	agentConfig agent.Config,
) error {
	from := version.Current
	from.Number = agentConfig.UpgradedToVersion()
	if from == version.Current {
		logger.Infof("upgrade to %v already completed.", version.Current)
		return nil
	}
	var err error
	writeErr := a.ChangeConfig(func(agentConfig agent.ConfigSetter) {
		context := upgrades.NewContext(agentConfig, apiState, st)
		for _, job := range jobs {
			target := upgradeTarget(job)
			if target == "" {
				continue
			}
			logger.Infof("starting upgrade from %v to %v for %v %q", from, version.Current, target, a.Tag())
			if err = upgrades.PerformUpgrade(from.Number, target, context); err != nil {
				err = fmt.Errorf("cannot perform upgrade from %v to %v for %v %q: %v", from, version.Current, target, a.Tag(), err)
				return
			}
		}
		agentConfig.SetUpgradedToVersion(version.Current.Number)
	})
	if writeErr != nil {
		return fmt.Errorf("cannot write updated agent configuration: %v", writeErr)
	}
	// Return any upgrade error captured in the closure above; previously
	// this returned nil unconditionally, silently discarding it.
	return err
}

func upgradeTarget(job params.MachineJob) upgrades.Target {
	switch job {
	case params.JobManageEnviron:
		return upgrades.StateServer
	case params.JobHostUnits:
		return upgrades.HostMachine
	}
	return ""
}

// WorkersStarted returns a channel that's closed once all top level workers
// have been started. This is provided for testing purposes.
func (a *MachineAgent) WorkersStarted() <-chan struct{} {
	return a.workersStarted
}

func (a *MachineAgent) Tag() string {
	return names.MachineTag(a.MachineId)
}

func (a *MachineAgent) createJujuRun(dataDir string) error {
	// TODO do not remove the symlink if it already points
	// to the right place.
	if err := os.Remove(jujuRun); err != nil && !os.IsNotExist(err) {
		return err
	}
	jujud := filepath.Join(dataDir, "tools", a.Tag(), "jujud")
	return os.Symlink(jujud, jujuRun)
}

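// For a machine with id "0" under the conventional data directory, the
// symlink created above ends up as (paths shown for illustration):
//
//	/usr/local/bin/juju-run -> /var/lib/juju/tools/machine-0/jujud
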
func (a *MachineAgent) uninstallAgent(agentConfig agent.Config) error {
	var errs []error // named errs to avoid shadowing the imported errors package
	agentServiceName := agentConfig.Value(agent.AgentServiceName)
	if agentServiceName == "" {
		// For backwards compatibility, handle lack of AgentServiceName.
		agentServiceName = os.Getenv("UPSTART_JOB")
	}
	if agentServiceName != "" {
		if err := upstart.NewService(agentServiceName).Remove(); err != nil {
			errs = append(errs, fmt.Errorf("cannot remove service %q: %v", agentServiceName, err))
		}
	}
	// Remove the juju-run symlink.
	if err := os.Remove(jujuRun); err != nil && !os.IsNotExist(err) {
		errs = append(errs, err)
	}

	namespace := agentConfig.Value(agent.Namespace)
	if err := mongo.RemoveService(namespace); err != nil {
		errs = append(errs, fmt.Errorf("cannot stop/remove mongo service with namespace %q: %v", namespace, err))
	}
	if err := os.RemoveAll(agentConfig.DataDir()); err != nil {
		errs = append(errs, err)
	}
	if len(errs) == 0 {
		return nil
	}
	return fmt.Errorf("uninstall failed: %v", errs)
}

// singularAPIConn implements singular.Conn on
// top of an API connection.
type singularAPIConn struct {
	apiState   *api.State
	agentState *apiagent.State
}

func (c singularAPIConn) IsMaster() (bool, error) {
	return c.agentState.IsMaster()
}

func (c singularAPIConn) Ping() error {
	return c.apiState.Ping()
}

// singularStateConn implements singular.Conn on
// top of a State connection.
type singularStateConn struct {
	session *mgo.Session
	machine *state.Machine
}

func (c singularStateConn) IsMaster() (bool, error) {
	return mongo.IsMaster(c.session, c.machine)
}

func (c singularStateConn) Ping() error {
	return c.session.Ping()
}