github.com/altoros/juju-vmware@v0.0.0-20150312064031-f19ae857ccca/cmd/jujud/agent/machine.go (about)

     1  // Copyright 2012, 2013 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package agent
     5  
     6  import (
     7  	"fmt"
     8  	"net"
     9  	"os"
    10  	"path/filepath"
    11  	"runtime"
    12  	"strconv"
    13  	"sync"
    14  	"time"
    15  
    16  	"github.com/juju/cmd"
    17  	"github.com/juju/errors"
    18  	"github.com/juju/loggo"
    19  	"github.com/juju/names"
    20  	"github.com/juju/utils"
    21  	"github.com/juju/utils/featureflag"
    22  	"github.com/juju/utils/symlink"
    23  	"github.com/juju/utils/voyeur"
    24  	"gopkg.in/juju/charm.v4"
    25  	"gopkg.in/mgo.v2"
    26  	"launchpad.net/gnuflag"
    27  	"launchpad.net/tomb"
    28  
    29  	"github.com/juju/juju/agent"
    30  	"github.com/juju/juju/api"
    31  	apiagent "github.com/juju/juju/api/agent"
    32  	apideployer "github.com/juju/juju/api/deployer"
    33  	"github.com/juju/juju/api/metricsmanager"
    34  	"github.com/juju/juju/apiserver"
    35  	"github.com/juju/juju/apiserver/params"
    36  	"github.com/juju/juju/cmd/jujud/reboot"
    37  	cmdutil "github.com/juju/juju/cmd/jujud/util"
    38  	"github.com/juju/juju/container"
    39  	"github.com/juju/juju/container/kvm"
    40  	"github.com/juju/juju/container/lxc"
    41  	"github.com/juju/juju/environs"
    42  	"github.com/juju/juju/environs/config"
    43  	"github.com/juju/juju/instance"
    44  	jujunames "github.com/juju/juju/juju/names"
    45  	"github.com/juju/juju/juju/paths"
    46  	"github.com/juju/juju/lease"
    47  	"github.com/juju/juju/mongo"
    48  	"github.com/juju/juju/network"
    49  	"github.com/juju/juju/provider"
    50  	"github.com/juju/juju/replicaset"
    51  	"github.com/juju/juju/service"
    52  	"github.com/juju/juju/service/common"
    53  	"github.com/juju/juju/state"
    54  	"github.com/juju/juju/state/multiwatcher"
    55  	statestorage "github.com/juju/juju/state/storage"
    56  	"github.com/juju/juju/storage"
    57  	coretools "github.com/juju/juju/tools"
    58  	"github.com/juju/juju/version"
    59  	"github.com/juju/juju/worker"
    60  	"github.com/juju/juju/worker/apiaddressupdater"
    61  	"github.com/juju/juju/worker/authenticationworker"
    62  	"github.com/juju/juju/worker/certupdater"
    63  	"github.com/juju/juju/worker/charmrevisionworker"
    64  	"github.com/juju/juju/worker/cleaner"
    65  	"github.com/juju/juju/worker/deployer"
    66  	"github.com/juju/juju/worker/diskmanager"
    67  	"github.com/juju/juju/worker/envworkermanager"
    68  	"github.com/juju/juju/worker/firewaller"
    69  	"github.com/juju/juju/worker/instancepoller"
    70  	"github.com/juju/juju/worker/localstorage"
    71  	workerlogger "github.com/juju/juju/worker/logger"
    72  	"github.com/juju/juju/worker/machiner"
    73  	"github.com/juju/juju/worker/metricworker"
    74  	"github.com/juju/juju/worker/minunitsworker"
    75  	"github.com/juju/juju/worker/networker"
    76  	"github.com/juju/juju/worker/peergrouper"
    77  	"github.com/juju/juju/worker/provisioner"
    78  	"github.com/juju/juju/worker/proxyupdater"
    79  	rebootworker "github.com/juju/juju/worker/reboot"
    80  	"github.com/juju/juju/worker/resumer"
    81  	"github.com/juju/juju/worker/rsyslog"
    82  	"github.com/juju/juju/worker/singular"
    83  	"github.com/juju/juju/worker/terminationworker"
    84  	"github.com/juju/juju/worker/upgrader"
    85  	"gopkg.in/natefinch/lumberjack.v2"
    86  )
    87  
// bootstrapMachineId is the machine id of the bootstrap machine. It is
// compared against the agent's own machine id/tag when deciding how to
// behave on a local provider.
const bootstrapMachineId = "0"
    89  
var (
	// logger is the package logger, registered as "juju.cmd.jujud".
	// retryDelay is not referenced in this part of the file; presumably it
	// is a delay between retries — TODO confirm against its users.
	// JujuRun is the juju-run path for the current series; MustSucceed
	// suggests a lookup failure is treated as fatal at package init —
	// TODO confirm.
	logger     = loggo.GetLogger("juju.cmd.jujud")
	retryDelay = 3 * time.Second
	JujuRun    = paths.MustSucceed(paths.JujuRun(version.Current.Series))

	// The following are defined as variables to allow the tests to
	// intercept calls to the functions.
	useMultipleCPUs          = utils.UseMultipleCPUs
	maybeInitiateMongoServer = peergrouper.MaybeInitiateMongoServer
	ensureMongoAdminUser     = mongo.EnsureAdminUser
	newSingularRunner        = singular.New
	peergrouperNew           = peergrouper.New
	newNetworker             = networker.NewNetworker
	newFirewaller            = firewaller.NewFirewaller
	newDiskManager           = diskmanager.NewWorker
	newCertificateUpdater    = certupdater.NewCertificateUpdater
	reportOpenedState        = func(interface{}) {}
	reportOpenedAPI          = func(interface{}) {}
	getMetricAPI             = metricAPI
)
   110  
   111  func init() {
   112  	stateWorkerDialOpts = mongo.DefaultDialOpts()
   113  	stateWorkerDialOpts.PostDial = func(session *mgo.Session) error {
   114  		safe := mgo.Safe{
   115  			// Wait for group commit if journaling is enabled,
   116  			// which is always true in production.
   117  			J: true,
   118  		}
   119  		_, err := replicaset.CurrentConfig(session)
   120  		if err == nil {
   121  			// set mongo to write-majority (writes only returned after
   122  			// replicated to a majority of replica-set members).
   123  			safe.WMode = "majority"
   124  		}
   125  		session.SetSafe(&safe)
   126  		return nil
   127  	}
   128  }
   129  
// AgentInitializer handles initializing a type for use as a Jujud
// agent.
type AgentInitializer interface {
	// AddFlags is given the command's flag set; machineAgentCmd.SetFlags
	// calls it before registering its own flags.
	AddFlags(*gnuflag.FlagSet)
	// CheckArgs is given the command's arguments; a non-nil error
	// aborts machineAgentCmd.Init.
	CheckArgs([]string) error
}
   136  
// AgentConfigWriter encapsulates disk I/O operations with the agent
// config. MachineAgent embeds this interface and overrides ChangeConfig
// to additionally notify configuration watchers.
type AgentConfigWriter interface {
	// ReadConfig reads the config for the given tag from disk.
	ReadConfig(tag string) error
	// ChangeConfig executes the given AgentConfigMutator in a
	// thread-safe context.
	ChangeConfig(AgentConfigMutator) error
	// CurrentConfig returns a copy of the in-memory agent config.
	CurrentConfig() agent.Config
}
   148  
   149  // NewMachineAgentCmd creates a Command which handles parsing
   150  // command-line arguments and instantiating and running a
   151  // MachineAgent.
   152  func NewMachineAgentCmd(
   153  	machineAgentFactory func(string) *MachineAgent,
   154  	agentInitializer AgentInitializer,
   155  	configFetcher AgentConfigWriter,
   156  ) cmd.Command {
   157  	return &machineAgentCmd{
   158  		machineAgentFactory: machineAgentFactory,
   159  		agentInitializer:    agentInitializer,
   160  		currentConfig:       configFetcher,
   161  	}
   162  }
   163  
// machineAgentCmd is the cmd.Command implementation returned by
// NewMachineAgentCmd. It validates flags, switches logging over to a
// rolling log file and delegates the real work to a MachineAgent.
type machineAgentCmd struct {
	cmd.CommandBase

	// This group of arguments is required.
	agentInitializer    AgentInitializer
	currentConfig       AgentConfigWriter
	machineAgentFactory func(string) *MachineAgent

	// This group is for debugging purposes.
	// When logToStdErr is set, Init returns before switching to the
	// rolling log file.
	logToStdErr bool

	// The following are set via command-line flags.
	// machineId is bound to the --machine-id flag in SetFlags.
	machineId string
}
   178  
   179  // Init is called by the cmd system to initialize the structure for
   180  // running.
   181  func (a *machineAgentCmd) Init(args []string) error {
   182  
   183  	if !names.IsValidMachine(a.machineId) {
   184  		return fmt.Errorf("--machine-id option must be set, and expects a non-negative integer")
   185  	}
   186  	if err := a.agentInitializer.CheckArgs(args); err != nil {
   187  		return err
   188  	}
   189  
   190  	// Due to changes in the logging, and needing to care about old
   191  	// environments that have been upgraded, we need to explicitly remove the
   192  	// file writer if one has been added, otherwise we will get duplicate
   193  	// lines of all logging in the log file.
   194  	loggo.RemoveWriter("logfile")
   195  
   196  	if a.logToStdErr {
   197  		return nil
   198  	}
   199  
   200  	err := a.currentConfig.ReadConfig(names.NewMachineTag(a.machineId).String())
   201  	if err != nil {
   202  		return errors.Annotate(err, "cannot read agent configuration")
   203  	}
   204  	agentConfig := a.currentConfig.CurrentConfig()
   205  	filename := filepath.Join(agentConfig.LogDir(), agentConfig.Tag().String()+".log")
   206  
   207  	log := &lumberjack.Logger{
   208  		Filename:   filename,
   209  		MaxSize:    300, // megabytes
   210  		MaxBackups: 2,
   211  	}
   212  
   213  	return cmdutil.SwitchProcessToRollingLogs(log)
   214  }
   215  
   216  // Run instantiates a MachineAgent and runs it.
   217  func (a *machineAgentCmd) Run(c *cmd.Context) error {
   218  	machineAgent := a.machineAgentFactory(a.machineId)
   219  	return machineAgent.Run(c)
   220  }
   221  
   222  // SetFlags adds the requisite flags to run this command.
   223  func (a *machineAgentCmd) SetFlags(f *gnuflag.FlagSet) {
   224  	a.agentInitializer.AddFlags(f)
   225  	f.StringVar(&a.machineId, "machine-id", "", "id of the machine to run")
   226  }
   227  
   228  // Info returns usage information for the command.
   229  func (a *machineAgentCmd) Info() *cmd.Info {
   230  	return &cmd.Info{
   231  		Name:    "machine",
   232  		Purpose: "run a juju machine agent",
   233  	}
   234  }
   235  
   236  // MachineAgentFactoryFn returns a function which instantiates a
   237  // MachineAgent given a machineId.
   238  func MachineAgentFactoryFn(
   239  	agentConfWriter AgentConfigWriter,
   240  	apiAddressSetter apiaddressupdater.APIAddressSetter,
   241  ) func(string) *MachineAgent {
   242  	return func(machineId string) *MachineAgent {
   243  		return NewMachineAgent(
   244  			machineId,
   245  			agentConfWriter,
   246  			apiAddressSetter,
   247  			NewUpgradeWorkerContext(),
   248  			worker.NewRunner(cmdutil.IsFatal, cmdutil.MoreImportant),
   249  		)
   250  	}
   251  }
   252  
   253  // NewMachineAgent instantiates a new MachineAgent.
   254  func NewMachineAgent(
   255  	machineId string,
   256  	agentConfWriter AgentConfigWriter,
   257  	apiAddressSetter apiaddressupdater.APIAddressSetter,
   258  	upgradeWorkerContext *upgradeWorkerContext,
   259  	runner worker.Runner,
   260  ) *MachineAgent {
   261  
   262  	return &MachineAgent{
   263  		machineId:            machineId,
   264  		AgentConfigWriter:    agentConfWriter,
   265  		apiAddressSetter:     apiAddressSetter,
   266  		workersStarted:       make(chan struct{}),
   267  		upgradeWorkerContext: upgradeWorkerContext,
   268  		runner:               runner,
   269  	}
   270  }
   271  
// MachineAgent is responsible for tying together all functionality
// needed to orchestrate a Jujud instance which controls a machine.
type MachineAgent struct {
	AgentConfigWriter

	// tomb tracks the lifetime of Run; Wait, Stop and Dying expose it.
	tomb tomb.Tomb
	// machineId is the id of the machine this agent controls.
	machineId string
	// previousAgentVersion is set in Run from the configuration's
	// UpgradedToVersion and is handed to the upgrader worker.
	previousAgentVersion version.Number
	apiAddressSetter     apiaddressupdater.APIAddressSetter
	// runner hosts the top-level workers started by Run.
	runner worker.Runner
	// configChangedVal is set on every ChangeConfig call (and once in
	// Run); stateStarter watches it.
	configChangedVal     voyeur.Value
	upgradeWorkerContext *upgradeWorkerContext
	// restoreMode is set by PrepareRestore, restoring by BeginRestore.
	// NOTE(review): both are read and written without synchronisation
	// in this part of the file — confirm callers serialise access.
	restoreMode bool
	restoring   bool
	// workersStarted is closed by Run once the top-level workers have
	// been configured to start.
	workersStarted chan struct{}

	// NOTE(review): not used in the visible part of the file;
	// presumably mongoInitMutex guards mongoInitialized — TODO confirm.
	mongoInitMutex   sync.Mutex
	mongoInitialized bool
}
   291  
   292  // IsRestorePreparing returns bool representing if we are in restore mode
   293  // but not running restore.
   294  func (a *MachineAgent) IsRestorePreparing() bool {
   295  	return a.restoreMode && !a.restoring
   296  }
   297  
   298  // IsRestoreRunning returns bool representing if we are in restore mode
   299  // and running the actual restore process.
   300  func (a *MachineAgent) IsRestoreRunning() bool {
   301  	return a.restoring
   302  }
   303  
   304  // Wait waits for the machine agent to finish.
   305  func (a *MachineAgent) Wait() error {
   306  	return a.tomb.Wait()
   307  }
   308  
   309  // Stop stops the machine agent.
   310  func (a *MachineAgent) Stop() error {
   311  	a.runner.Kill()
   312  	return a.tomb.Wait()
   313  }
   314  
   315  // Dying returns the channel that can be used to see if the machine
   316  // agent is terminating.
   317  func (a *MachineAgent) Dying() <-chan struct{} {
   318  	return a.tomb.Dying()
   319  }
   320  
   321  // Run runs a machine agent.
   322  func (a *MachineAgent) Run(*cmd.Context) error {
   323  
   324  	defer a.tomb.Done()
   325  	if err := a.ReadConfig(a.Tag().String()); err != nil {
   326  		return fmt.Errorf("cannot read agent configuration: %v", err)
   327  	}
   328  	agentConfig := a.CurrentConfig()
   329  
   330  	logger.Infof("machine agent %v start (%s [%s])", a.Tag(), version.Current, runtime.Compiler)
   331  	if flags := featureflag.String(); flags != "" {
   332  		logger.Warningf("developer feature flags enabled: %s", flags)
   333  	}
   334  
   335  	if err := a.upgradeWorkerContext.InitializeUsingAgent(a); err != nil {
   336  		return errors.Annotate(err, "error during upgradeWorkerContext initialisation")
   337  	}
   338  	a.configChangedVal.Set(struct{}{})
   339  	a.previousAgentVersion = agentConfig.UpgradedToVersion()
   340  	network.InitializeFromConfig(agentConfig)
   341  	charm.CacheDir = filepath.Join(agentConfig.DataDir(), "charmcache")
   342  	if err := a.createJujuRun(agentConfig.DataDir()); err != nil {
   343  		return fmt.Errorf("cannot create juju run symlink: %v", err)
   344  	}
   345  	a.runner.StartWorker("api", a.APIWorker)
   346  	a.runner.StartWorker("statestarter", a.newStateStarterWorker)
   347  	a.runner.StartWorker("termination", func() (worker.Worker, error) {
   348  		return terminationworker.NewWorker(), nil
   349  	})
   350  	// At this point, all workers will have been configured to start
   351  	close(a.workersStarted)
   352  	err := a.runner.Wait()
   353  	switch err {
   354  	case worker.ErrTerminateAgent:
   355  		err = a.uninstallAgent(agentConfig)
   356  	case worker.ErrRebootMachine:
   357  		logger.Infof("Caught reboot error")
   358  		err = a.executeRebootOrShutdown(params.ShouldReboot)
   359  	case worker.ErrShutdownMachine:
   360  		logger.Infof("Caught shutdown error")
   361  		err = a.executeRebootOrShutdown(params.ShouldShutdown)
   362  	}
   363  	err = cmdutil.AgentDone(logger, err)
   364  	a.tomb.Kill(err)
   365  	return err
   366  }
   367  
   368  func (a *MachineAgent) executeRebootOrShutdown(action params.RebootAction) error {
   369  	agentCfg := a.CurrentConfig()
   370  	// At this stage, all API connections would have been closed
   371  	// We need to reopen the API to clear the reboot flag after
   372  	// scheduling the reboot. It may be cleaner to do this in the reboot
   373  	// worker, before returning the ErrRebootMachine.
   374  	st, _, err := OpenAPIState(agentCfg, a)
   375  	if err != nil {
   376  		logger.Infof("Reboot: Error connecting to state")
   377  		return errors.Trace(err)
   378  	}
   379  	// block until all units/containers are ready, and reboot/shutdown
   380  	finalize, err := reboot.NewRebootWaiter(st, agentCfg)
   381  	if err != nil {
   382  		return errors.Trace(err)
   383  	}
   384  
   385  	logger.Infof("Reboot: Executing reboot")
   386  	err = finalize.ExecuteReboot(action)
   387  	if err != nil {
   388  		logger.Infof("Reboot: Error executing reboot: %v", err)
   389  		return errors.Trace(err)
   390  	}
   391  	// On windows, the shutdown command is asynchronous. We return ErrRebootMachine
   392  	// so the agent will simply exit without error pending reboot/shutdown.
   393  	return worker.ErrRebootMachine
   394  }
   395  
   396  func (a *MachineAgent) ChangeConfig(mutate AgentConfigMutator) error {
   397  	err := a.AgentConfigWriter.ChangeConfig(mutate)
   398  	a.configChangedVal.Set(struct{}{})
   399  	if err != nil {
   400  		return errors.Trace(err)
   401  	}
   402  	return nil
   403  }
   404  
   405  // PrepareRestore will flag the agent to allow only a limited set
   406  // of commands defined in
   407  // "github.com/juju/juju/apiserver".allowedMethodsAboutToRestore
   408  // the most noteworthy is:
   409  // Backups.Restore: this will ensure that we can do all the file movements
   410  // required for restore and no one will do changes while we do that.
   411  // it will return error if the machine is already in this state.
   412  func (a *MachineAgent) PrepareRestore() error {
   413  	if a.restoreMode {
   414  		return errors.Errorf("already in restore mode")
   415  	}
   416  	a.restoreMode = true
   417  	return nil
   418  }
   419  
   420  // BeginRestore will flag the agent to disallow all commands since
   421  // restore should be running and therefore making changes that
   422  // would override anything done.
   423  func (a *MachineAgent) BeginRestore() error {
   424  	switch {
   425  	case !a.restoreMode:
   426  		return errors.Errorf("not in restore mode, cannot begin restoration")
   427  	case a.restoring:
   428  		return errors.Errorf("already restoring")
   429  	}
   430  	a.restoring = true
   431  	return nil
   432  }
   433  
   434  // newrestorestatewatcherworker will return a worker or err if there is a failure,
   435  // the worker takes care of watching the state of restoreInfo doc and put the
   436  // agent in the different restore modes.
   437  func (a *MachineAgent) newRestoreStateWatcherWorker(st *state.State) (worker.Worker, error) {
   438  	rWorker := func(stopch <-chan struct{}) error {
   439  		return a.restoreStateWatcher(st, stopch)
   440  	}
   441  	return worker.NewSimpleWorker(rWorker), nil
   442  }
   443  
   444  // restoreChanged will be called whenever restoreInfo doc changes signaling a new
   445  // step in the restore process.
   446  func (a *MachineAgent) restoreChanged(st *state.State) error {
   447  	rinfo, err := st.EnsureRestoreInfo()
   448  	if err != nil {
   449  		return errors.Annotate(err, "cannot read restore state")
   450  	}
   451  	switch rinfo.Status() {
   452  	case state.RestorePending:
   453  		a.PrepareRestore()
   454  	case state.RestoreInProgress:
   455  		a.BeginRestore()
   456  	}
   457  	return nil
   458  }
   459  
   460  // restoreStateWatcher watches for restoreInfo looking for changes in the restore process.
   461  func (a *MachineAgent) restoreStateWatcher(st *state.State, stopch <-chan struct{}) error {
   462  	restoreWatch := st.WatchRestoreInfoChanges()
   463  	defer func() {
   464  		restoreWatch.Kill()
   465  		restoreWatch.Wait()
   466  	}()
   467  
   468  	for {
   469  		select {
   470  		case <-restoreWatch.Changes():
   471  			if err := a.restoreChanged(st); err != nil {
   472  				return err
   473  			}
   474  		case <-stopch:
   475  			return nil
   476  		}
   477  	}
   478  }
   479  
   480  // newStateStarterWorker wraps stateStarter in a simple worker for use in
   481  // a.runner.StartWorker.
   482  func (a *MachineAgent) newStateStarterWorker() (worker.Worker, error) {
   483  	return worker.NewSimpleWorker(a.stateStarter), nil
   484  }
   485  
   486  // stateStarter watches for changes to the agent configuration, and
   487  // starts or stops the state worker as appropriate. We watch the agent
   488  // configuration because the agent configuration has all the details
   489  // that we need to start a state server, whether they have been cached
   490  // or read from the state.
   491  //
   492  // It will stop working as soon as stopch is closed.
   493  func (a *MachineAgent) stateStarter(stopch <-chan struct{}) error {
   494  	confWatch := a.configChangedVal.Watch()
   495  	defer confWatch.Close()
   496  	watchCh := make(chan struct{})
   497  	go func() {
   498  		for confWatch.Next() {
   499  			watchCh <- struct{}{}
   500  		}
   501  	}()
   502  	for {
   503  		select {
   504  		case <-watchCh:
   505  			agentConfig := a.CurrentConfig()
   506  
   507  			// N.B. StartWorker and StopWorker are idempotent.
   508  			_, ok := agentConfig.StateServingInfo()
   509  			if ok {
   510  				a.runner.StartWorker("state", func() (worker.Worker, error) {
   511  					return a.StateWorker()
   512  				})
   513  			} else {
   514  				a.runner.StopWorker("state")
   515  			}
   516  		case <-stopch:
   517  			return nil
   518  		}
   519  	}
   520  }
   521  
   522  // APIWorker returns a Worker that connects to the API and starts any
   523  // workers that need an API connection.
   524  func (a *MachineAgent) APIWorker() (worker.Worker, error) {
   525  	agentConfig := a.CurrentConfig()
   526  	st, entity, err := OpenAPIState(agentConfig, a)
   527  	if err != nil {
   528  		return nil, err
   529  	}
   530  	reportOpenedAPI(st)
   531  
   532  	// Refresh the configuration, since it may have been updated after opening state.
   533  	agentConfig = a.CurrentConfig()
   534  	for _, job := range entity.Jobs() {
   535  		if job.NeedsState() {
   536  			info, err := st.Agent().StateServingInfo()
   537  			if err != nil {
   538  				return nil, fmt.Errorf("cannot get state serving info: %v", err)
   539  			}
   540  			err = a.ChangeConfig(func(config agent.ConfigSetter) error {
   541  				config.SetStateServingInfo(info)
   542  				return nil
   543  			})
   544  			if err != nil {
   545  				return nil, err
   546  			}
   547  			agentConfig = a.CurrentConfig()
   548  			break
   549  		}
   550  	}
   551  
   552  	// Before starting any workers, ensure we record the Juju version this machine
   553  	// agent is running.
   554  	currentTools := &coretools.Tools{Version: version.Current}
   555  	if err := st.Upgrader().SetVersion(agentConfig.Tag().String(), currentTools.Version); err != nil {
   556  		return nil, errors.Annotate(err, "cannot set machine agent version")
   557  	}
   558  
   559  	runner := newConnRunner(st)
   560  
   561  	// Run the upgrader and the upgrade-steps worker without waiting for
   562  	// the upgrade steps to complete.
   563  	runner.StartWorker("upgrader", func() (worker.Worker, error) {
   564  		return upgrader.NewUpgrader(
   565  			st.Upgrader(),
   566  			agentConfig,
   567  			a.previousAgentVersion,
   568  			a.upgradeWorkerContext.IsUpgradeRunning,
   569  		), nil
   570  	})
   571  	runner.StartWorker("upgrade-steps", a.upgradeStepsWorkerStarter(st, entity.Jobs()))
   572  
   573  	// All other workers must wait for the upgrade steps to complete before starting.
   574  	a.startWorkerAfterUpgrade(runner, "api-post-upgrade", func() (worker.Worker, error) {
   575  		return a.postUpgradeAPIWorker(st, agentConfig, entity)
   576  	})
   577  
   578  	return cmdutil.NewCloseWorker(logger, runner, st), nil // Note: a worker.Runner is itself a worker.Worker.
   579  }
   580  
// postUpgradeAPIWorker starts the API-based workers that must only run
// once the upgrade steps have completed, all under a fresh runner bound
// to st. Which workers are started depends on the machine's jobs, the
// provider type and the environment configuration. The runner is
// returned wrapped in a close worker together with st.
//
// NOTE(review): on the error returns below, the workers already started
// on the local runner do not appear to be stopped — confirm whether the
// caller cleans them up.
func (a *MachineAgent) postUpgradeAPIWorker(
	st *api.State,
	agentConfig agent.Config,
	entity *apiagent.Entity,
) (worker.Worker, error) {

	// Machines that manage the environment accumulate logs; all others
	// forward them.
	rsyslogMode := rsyslog.RsyslogModeForwarding
	var err error
	for _, job := range entity.Jobs() {
		if job == multiwatcher.JobManageEnviron {
			rsyslogMode = rsyslog.RsyslogModeAccumulate
			break
		}
	}

	runner := newConnRunner(st)
	// TODO(fwereade): this is *still* a hideous layering violation, but at least
	// it's confined to jujud rather than extending into the worker itself.
	// Start this worker first to try and get proxy settings in place
	// before we do anything else.
	writeSystemFiles := shouldWriteProxyFiles(agentConfig)
	runner.StartWorker("proxyupdater", func() (worker.Worker, error) {
		return proxyupdater.New(st.Environment(), writeSystemFiles), nil
	})

	runner.StartWorker("machiner", func() (worker.Worker, error) {
		return machiner.NewMachiner(st.Machiner(), agentConfig), nil
	})
	runner.StartWorker("reboot", func() (worker.Worker, error) {
		reboot, err := st.Reboot()
		if err != nil {
			return nil, errors.Trace(err)
		}
		// NOTE(review): this takes the hook execution lock under
		// cmdutil.DataDir, whereas updateSupportedContainers uses
		// agentConfig.DataDir() — confirm both refer to the same lock.
		lock, err := cmdutil.HookExecutionLock(cmdutil.DataDir)
		if err != nil {
			return nil, errors.Trace(err)
		}
		return rebootworker.NewReboot(reboot, agentConfig, lock)
	})
	runner.StartWorker("apiaddressupdater", func() (worker.Worker, error) {
		return apiaddressupdater.NewAPIAddressUpdater(st.Machiner(), a.apiAddressSetter), nil
	})
	runner.StartWorker("logger", func() (worker.Worker, error) {
		return workerlogger.NewLogger(st.Logger(), agentConfig), nil
	})

	runner.StartWorker("rsyslog", func() (worker.Worker, error) {
		return cmdutil.NewRsyslogConfigWorker(st.Rsyslog(), agentConfig, rsyslogMode)
	})
	// TODO(axw) stop checking feature flag once storage has graduated.
	if featureflag.Enabled(storage.FeatureFlag) {
		runner.StartWorker("diskmanager", func() (worker.Worker, error) {
			api, err := st.DiskManager()
			if err != nil {
				return nil, errors.Trace(err)
			}
			return newDiskManager(diskmanager.DefaultListBlockDevices, api), nil
		})
	}

	// Check if the network management is disabled.
	envConfig, err := st.Environment().EnvironConfig()
	if err != nil {
		return nil, fmt.Errorf("cannot read environment config: %v", err)
	}
	disableNetworkManagement, _ := envConfig.DisableNetworkManagement()
	if disableNetworkManagement {
		logger.Infof("network management is disabled")
	}

	// Start networker depending on configuration and job.
	// Intrusive mode requires both the JobManageNetworking job and that
	// network management has not been disabled in the environment.
	intrusiveMode := false
	for _, job := range entity.Jobs() {
		if job == multiwatcher.JobManageNetworking {
			intrusiveMode = true
			break
		}
	}
	intrusiveMode = intrusiveMode && !disableNetworkManagement
	runner.StartWorker("networker", func() (worker.Worker, error) {
		return newNetworker(st.Networker(), agentConfig, intrusiveMode, networker.DefaultConfigBaseDir)
	})

	// If not a local provider bootstrap machine, start the worker to
	// manage SSH keys.
	providerType := agentConfig.Value(agent.ProviderType)
	if providerType != provider.Local || a.machineId != bootstrapMachineId {
		runner.StartWorker("authenticationworker", func() (worker.Worker, error) {
			return authenticationworker.NewWorker(st.KeyUpdater(), agentConfig), nil
		})
	}

	// Perform the operations needed to set up hosting for containers.
	if err := a.setupContainerSupport(runner, st, entity, agentConfig); err != nil {
		// A dead machine, or an explicit request to terminate, is
		// reported as ErrTerminateAgent.
		cause := errors.Cause(err)
		if params.IsCodeDead(cause) || cause == worker.ErrTerminateAgent {
			return nil, worker.ErrTerminateAgent
		}
		return nil, fmt.Errorf("setting up container support: %v", err)
	}
	for _, job := range entity.Jobs() {
		switch job {
		case multiwatcher.JobHostUnits:
			runner.StartWorker("deployer", func() (worker.Worker, error) {
				apiDeployer := st.Deployer()
				context := newDeployContext(apiDeployer, agentConfig)
				return deployer.NewDeployer(apiDeployer, context), nil
			})
		case multiwatcher.JobManageEnviron:
			runner.StartWorker("identity-file-writer", func() (worker.Worker, error) {
				// The inner function ignores its stop channel: it
				// writes the system identity file for the current
				// configuration and returns.
				inner := func(<-chan struct{}) error {
					agentConfig := a.CurrentConfig()
					return agent.WriteSystemIdentityFile(agentConfig)
				}
				return worker.NewSimpleWorker(inner), nil
			})
		case multiwatcher.JobManageStateDeprecated:
			// Legacy environments may set this, but we ignore it.
		default:
			// TODO(dimitern): Once all workers moved over to using
			// the API, report "unknown job type" here.
		}
	}

	return cmdutil.NewCloseWorker(logger, runner, st), nil // Note: a worker.Runner is itself a worker.Worker.
}
   707  
   708  func (a *MachineAgent) upgradeStepsWorkerStarter(
   709  	st *api.State,
   710  	jobs []multiwatcher.MachineJob,
   711  ) func() (worker.Worker, error) {
   712  	return func() (worker.Worker, error) {
   713  		return a.upgradeWorkerContext.Worker(a, st, jobs), nil
   714  	}
   715  }
   716  
   717  // shouldWriteProxyFiles returns true, unless the supplied conf identifies the
   718  // machine agent running directly on the host system in a local environment.
   719  var shouldWriteProxyFiles = func(conf agent.Config) bool {
   720  	if conf.Value(agent.ProviderType) != provider.Local {
   721  		return true
   722  	}
   723  	return conf.Tag() != names.NewMachineTag(bootstrapMachineId)
   724  }
   725  
   726  // setupContainerSupport determines what containers can be run on this machine and
   727  // initialises suitable infrastructure to support such containers.
   728  func (a *MachineAgent) setupContainerSupport(runner worker.Runner, st *api.State, entity *apiagent.Entity, agentConfig agent.Config) error {
   729  	var supportedContainers []instance.ContainerType
   730  	// LXC containers are only supported on bare metal and fully virtualized linux systems
   731  	// Nested LXC containers and Windows machines cannot run LXC containers
   732  	supportsLXC, err := lxc.IsLXCSupported()
   733  	if err != nil {
   734  		logger.Warningf("no lxc containers possible: %v", err)
   735  	}
   736  	if err == nil && supportsLXC {
   737  		supportedContainers = append(supportedContainers, instance.LXC)
   738  	}
   739  
   740  	supportsKvm, err := kvm.IsKVMSupported()
   741  	if err != nil {
   742  		logger.Warningf("determining kvm support: %v\nno kvm containers possible", err)
   743  	}
   744  	if err == nil && supportsKvm {
   745  		supportedContainers = append(supportedContainers, instance.KVM)
   746  	}
   747  	return a.updateSupportedContainers(runner, st, entity.Tag(), supportedContainers, agentConfig)
   748  }
   749  
   750  // updateSupportedContainers records in state that a machine can run the specified containers.
   751  // It starts a watcher and when a container of a given type is first added to the machine,
   752  // the watcher is killed, the machine is set up to be able to start containers of the given type,
   753  // and a suitable provisioner is started.
   754  func (a *MachineAgent) updateSupportedContainers(
   755  	runner worker.Runner,
   756  	st *api.State,
   757  	machineTag string,
   758  	containers []instance.ContainerType,
   759  	agentConfig agent.Config,
   760  ) error {
   761  	pr := st.Provisioner()
   762  	tag, err := names.ParseMachineTag(machineTag)
   763  	if err != nil {
   764  		return err
   765  	}
   766  	machine, err := pr.Machine(tag)
   767  	if errors.IsNotFound(err) || err == nil && machine.Life() == params.Dead {
   768  		return worker.ErrTerminateAgent
   769  	}
   770  	if err != nil {
   771  		return errors.Annotatef(err, "cannot load machine %s from state", tag)
   772  	}
   773  	if len(containers) == 0 {
   774  		if err := machine.SupportsNoContainers(); err != nil {
   775  			return errors.Annotatef(err, "clearing supported containers for %s", tag)
   776  		}
   777  		return nil
   778  	}
   779  	if err := machine.SetSupportedContainers(containers...); err != nil {
   780  		return errors.Annotatef(err, "setting supported containers for %s", tag)
   781  	}
   782  	initLock, err := cmdutil.HookExecutionLock(agentConfig.DataDir())
   783  	if err != nil {
   784  		return err
   785  	}
   786  	// Start the watcher to fire when a container is first requested on the machine.
   787  	envUUID, err := st.EnvironTag()
   788  	if err != nil {
   789  		return err
   790  	}
   791  	watcherName := fmt.Sprintf("%s-container-watcher", machine.Id())
   792  	// There may not be a CA certificate private key available, and without
   793  	// it we can't ensure that other Juju nodes can connect securely, so only
   794  	// use an image URL getter if there's a private key.
   795  	var imageURLGetter container.ImageURLGetter
   796  	if agentConfig.Value(agent.AllowsSecureConnection) == "true" {
   797  		imageURLGetter = container.NewImageURLGetter(st.Addr(), envUUID.Id(), []byte(agentConfig.CACert()))
   798  	}
   799  	params := provisioner.ContainerSetupParams{
   800  		Runner:              runner,
   801  		WorkerName:          watcherName,
   802  		SupportedContainers: containers,
   803  		ImageURLGetter:      imageURLGetter,
   804  		Machine:             machine,
   805  		Provisioner:         pr,
   806  		Config:              agentConfig,
   807  		InitLock:            initLock,
   808  	}
   809  	handler := provisioner.NewContainerSetupHandler(params)
   810  	a.startWorkerAfterUpgrade(runner, watcherName, func() (worker.Worker, error) {
   811  		return worker.NewStringsWorker(handler), nil
   812  	})
   813  	return nil
   814  }
   815  
   816  // StateWorker returns a worker running all the workers that require
   817  // a *state.State connection.
   818  func (a *MachineAgent) StateWorker() (worker.Worker, error) {
   819  	agentConfig := a.CurrentConfig()
   820  
   821  	// Start MongoDB server and dial.
   822  	if err := a.ensureMongoServer(agentConfig); err != nil {
   823  		return nil, err
   824  	}
   825  	st, m, err := openState(agentConfig, stateWorkerDialOpts)
   826  	if err != nil {
   827  		return nil, err
   828  	}
   829  	reportOpenedState(st)
   830  
   831  	stor := statestorage.NewStorage(st.EnvironUUID(), st.MongoSession())
   832  	registerSimplestreamsDataSource(stor)
   833  
   834  	runner := newConnRunner(st)
   835  	singularRunner, err := newSingularStateRunner(runner, st, m)
   836  	if err != nil {
   837  		return nil, errors.Trace(err)
   838  	}
   839  
   840  	// Take advantage of special knowledge here in that we will only ever want
   841  	// the storage provider on one machine, and that is the "bootstrap" node.
   842  	providerType := agentConfig.Value(agent.ProviderType)
   843  	if (providerType == provider.Local || provider.IsManual(providerType)) && m.Id() == bootstrapMachineId {
   844  		a.startWorkerAfterUpgrade(runner, "local-storage", func() (worker.Worker, error) {
   845  			// TODO(axw) 2013-09-24 bug #1229507
   846  			// Make another job to enable storage.
   847  			// There's nothing special about this.
   848  			return localstorage.NewWorker(agentConfig), nil
   849  		})
   850  	}
   851  	for _, job := range m.Jobs() {
   852  		switch job {
   853  		case state.JobHostUnits:
   854  			// Implemented in APIWorker.
   855  		case state.JobManageEnviron:
   856  			useMultipleCPUs()
   857  			a.startWorkerAfterUpgrade(runner, "env worker manager", func() (worker.Worker, error) {
   858  				return envworkermanager.NewEnvWorkerManager(st, a.startEnvWorkers), nil
   859  			})
   860  			a.startWorkerAfterUpgrade(runner, "peergrouper", func() (worker.Worker, error) {
   861  				return peergrouperNew(st)
   862  			})
   863  			a.startWorkerAfterUpgrade(runner, "restore", func() (worker.Worker, error) {
   864  				return a.newRestoreStateWatcherWorker(st)
   865  			})
   866  			a.startWorkerAfterUpgrade(runner, "lease manager", func() (worker.Worker, error) {
   867  				workerLoop := lease.WorkerLoop(st)
   868  				return worker.NewSimpleWorker(workerLoop), nil
   869  			})
   870  			certChangedChan := make(chan params.StateServingInfo, 1)
   871  			runner.StartWorker("apiserver", a.apiserverWorkerStarter(st, certChangedChan))
   872  			var stateServingSetter certupdater.StateServingInfoSetter = func(info params.StateServingInfo) error {
   873  				return a.ChangeConfig(func(config agent.ConfigSetter) error {
   874  					config.SetStateServingInfo(info)
   875  					logger.Infof("update apiserver worker with new certificate")
   876  					certChangedChan <- info
   877  					return nil
   878  				})
   879  			}
   880  			a.startWorkerAfterUpgrade(runner, "certupdater", func() (worker.Worker, error) {
   881  				return newCertificateUpdater(m, agentConfig, st, stateServingSetter, certChangedChan), nil
   882  			})
   883  			a.startWorkerAfterUpgrade(singularRunner, "resumer", func() (worker.Worker, error) {
   884  				// The action of resumer is so subtle that it is not tested,
   885  				// because we can't figure out how to do so without brutalising
   886  				// the transaction log.
   887  				return resumer.NewResumer(st), nil
   888  			})
   889  		case state.JobManageStateDeprecated:
   890  			// Legacy environments may set this, but we ignore it.
   891  		default:
   892  			logger.Warningf("ignoring unknown job %q", job)
   893  		}
   894  	}
   895  	return cmdutil.NewCloseWorker(logger, runner, st), nil
   896  }
   897  
   898  // startEnvWorkers starts state server workers that need to run per
   899  // environment.
   900  func (a *MachineAgent) startEnvWorkers(
   901  	ssSt envworkermanager.InitialState,
   902  	st *state.State,
   903  ) (runner worker.Runner, err error) {
   904  	envUUID := st.EnvironUUID()
   905  	defer errors.DeferredAnnotatef(&err, "failed to start workers for env %s", envUUID)
   906  	logger.Infof("starting workers for env %s", envUUID)
   907  
   908  	// Establish API connection for this environment.
   909  	agentConfig := a.CurrentConfig()
   910  	apiInfo := agentConfig.APIInfo()
   911  	apiInfo.EnvironTag = st.EnvironTag()
   912  	apiSt, err := OpenAPIStateUsingInfo(apiInfo, a, agentConfig.OldPassword())
   913  	if err != nil {
   914  		return nil, errors.Trace(err)
   915  	}
   916  
   917  	// Create a runner for workers specific to this
   918  	// environment. Either the State or API connection failing will be
   919  	// considered fatal, killing the runner and all its workers.
   920  	runner = newConnRunner(st, apiSt)
   921  	defer func() {
   922  		if err != nil && runner != nil {
   923  			runner.Kill()
   924  			runner.Wait()
   925  		}
   926  	}()
   927  	// Close the API connection when the runner for this environment dies.
   928  	go func() {
   929  		runner.Wait()
   930  		err := apiSt.Close()
   931  		if err != nil {
   932  			logger.Errorf("failed to close API connection for env %s: %v", envUUID, err)
   933  		}
   934  	}()
   935  
   936  	// Create a singular runner for this environment.
   937  	machine, err := ssSt.Machine(a.machineId)
   938  	if err != nil {
   939  		return nil, errors.Trace(err)
   940  	}
   941  	singularRunner, err := newSingularStateRunner(runner, ssSt, machine)
   942  	if err != nil {
   943  		return nil, errors.Trace(err)
   944  	}
   945  	defer func() {
   946  		if err != nil && singularRunner != nil {
   947  			singularRunner.Kill()
   948  			singularRunner.Wait()
   949  		}
   950  	}()
   951  
   952  	// Start workers that depend on a *state.State.
   953  	runner.StartWorker("instancepoller", func() (worker.Worker, error) {
   954  		return instancepoller.NewWorker(st), nil
   955  	})
   956  	singularRunner.StartWorker("cleaner", func() (worker.Worker, error) {
   957  		return cleaner.NewCleaner(st), nil
   958  	})
   959  	singularRunner.StartWorker("minunitsworker", func() (worker.Worker, error) {
   960  		return minunitsworker.NewMinUnitsWorker(st), nil
   961  	})
   962  
   963  	// Start workers that use an API connection.
   964  	singularRunner.StartWorker("environ-provisioner", func() (worker.Worker, error) {
   965  		return provisioner.NewEnvironProvisioner(apiSt.Provisioner(), agentConfig), nil
   966  	})
   967  	singularRunner.StartWorker("charm-revision-updater", func() (worker.Worker, error) {
   968  		return charmrevisionworker.NewRevisionUpdateWorker(apiSt.CharmRevisionUpdater()), nil
   969  	})
   970  	runner.StartWorker("metricmanagerworker", func() (worker.Worker, error) {
   971  		return metricworker.NewMetricsManager(getMetricAPI(apiSt))
   972  	})
   973  
   974  	// TODO(axw) 2013-09-24 bug #1229506
   975  	// Make another job to enable the firewaller. Not all
   976  	// environments are capable of managing ports
   977  	// centrally.
   978  	fwMode, err := getFirewallMode(apiSt)
   979  	if err != nil {
   980  		return nil, errors.Annotate(err, "cannot get firewall mode")
   981  	}
   982  	if fwMode != config.FwNone {
   983  		singularRunner.StartWorker("firewaller", func() (worker.Worker, error) {
   984  			return newFirewaller(apiSt.Firewaller())
   985  		})
   986  	} else {
   987  		logger.Debugf("not starting firewaller worker - firewall-mode is %q", fwMode)
   988  	}
   989  
   990  	return runner, nil
   991  }
   992  
   993  var getFirewallMode = _getFirewallMode
   994  
   995  func _getFirewallMode(apiSt *api.State) (string, error) {
   996  	envConfig, err := apiSt.Environment().EnvironConfig()
   997  	if err != nil {
   998  		return "", errors.Annotate(err, "cannot read environment config")
   999  	}
  1000  	return envConfig.FirewallMode(), nil
  1001  }
  1002  
// stateWorkerDialOpts is a mongo.DialOpts suitable
// for use by StateWorker to dial mongo.
//
// This must be overridden in tests, as it assumes
// journaling is enabled.
//
// NOTE(review): no value is assigned at this declaration; presumably it is
// populated elsewhere in the package before StateWorker runs — confirm.
var stateWorkerDialOpts mongo.DialOpts
  1009  
  1010  func (a *MachineAgent) apiserverWorkerStarter(st *state.State, certChanged chan params.StateServingInfo) func() (worker.Worker, error) {
  1011  	return func() (worker.Worker, error) { return a.newApiserverWorker(st, certChanged) }
  1012  }
  1013  
  1014  func (a *MachineAgent) newApiserverWorker(st *state.State, certChanged chan params.StateServingInfo) (worker.Worker, error) {
  1015  	agentConfig := a.CurrentConfig()
  1016  	// If the configuration does not have the required information,
  1017  	// it is currently not a recoverable error, so we kill the whole
  1018  	// agent, potentially enabling human intervention to fix
  1019  	// the agent's configuration file.
  1020  	info, ok := agentConfig.StateServingInfo()
  1021  	if !ok {
  1022  		return nil, &cmdutil.FatalError{"StateServingInfo not available and we need it"}
  1023  	}
  1024  	cert := []byte(info.Cert)
  1025  	key := []byte(info.PrivateKey)
  1026  
  1027  	if len(cert) == 0 || len(key) == 0 {
  1028  		return nil, &cmdutil.FatalError{"configuration does not have state server cert/key"}
  1029  	}
  1030  	tag := agentConfig.Tag()
  1031  	dataDir := agentConfig.DataDir()
  1032  	logDir := agentConfig.LogDir()
  1033  
  1034  	endpoint := net.JoinHostPort("", strconv.Itoa(info.APIPort))
  1035  	listener, err := net.Listen("tcp", endpoint)
  1036  	if err != nil {
  1037  		return nil, err
  1038  	}
  1039  	return apiserver.NewServer(st, listener, apiserver.ServerConfig{
  1040  		Cert:        cert,
  1041  		Key:         key,
  1042  		Tag:         tag,
  1043  		DataDir:     dataDir,
  1044  		LogDir:      logDir,
  1045  		Validator:   a.limitLogins,
  1046  		CertChanged: certChanged,
  1047  	})
  1048  }
  1049  
  1050  // limitLogins is called by the API server for each login attempt.
  1051  // it returns an error if upgrads or restore are running.
  1052  func (a *MachineAgent) limitLogins(req params.LoginRequest) error {
  1053  	if err := a.limitLoginsDuringRestore(req); err != nil {
  1054  		return err
  1055  	}
  1056  	return a.limitLoginsDuringUpgrade(req)
  1057  }
  1058  
  1059  // limitLoginsDuringRestore will only allow logins for restore related purposes
  1060  // while the different steps of restore are running.
  1061  func (a *MachineAgent) limitLoginsDuringRestore(req params.LoginRequest) error {
  1062  	var err error
  1063  	switch {
  1064  	case a.IsRestoreRunning():
  1065  		err = apiserver.RestoreInProgressError
  1066  	case a.IsRestorePreparing():
  1067  		err = apiserver.AboutToRestoreError
  1068  	}
  1069  	if err != nil {
  1070  		authTag, parseErr := names.ParseTag(req.AuthTag)
  1071  		if parseErr != nil {
  1072  			return errors.Annotate(err, "could not parse auth tag")
  1073  		}
  1074  		switch authTag := authTag.(type) {
  1075  		case names.UserTag:
  1076  			// use a restricted API mode
  1077  			return err
  1078  		case names.MachineTag:
  1079  			if authTag == a.Tag() {
  1080  				// allow logins from the local machine
  1081  				return nil
  1082  			}
  1083  		}
  1084  		return errors.Errorf("login for %q blocked because restore is in progress", authTag)
  1085  	}
  1086  	return nil
  1087  }
  1088  
  1089  // limitLoginsDuringUpgrade is called by the API server for each login
  1090  // attempt. It returns an error if upgrades are in progress unless the
  1091  // login is for a user (i.e. a client) or the local machine.
  1092  func (a *MachineAgent) limitLoginsDuringUpgrade(req params.LoginRequest) error {
  1093  	if a.upgradeWorkerContext.IsUpgradeRunning() {
  1094  		authTag, err := names.ParseTag(req.AuthTag)
  1095  		if err != nil {
  1096  			return errors.Annotate(err, "could not parse auth tag")
  1097  		}
  1098  		switch authTag := authTag.(type) {
  1099  		case names.UserTag:
  1100  			// use a restricted API mode
  1101  			return apiserver.UpgradeInProgressError
  1102  		case names.MachineTag:
  1103  			if authTag == a.Tag() {
  1104  				// allow logins from the local machine
  1105  				return nil
  1106  			}
  1107  		}
  1108  		return errors.Errorf("login for %q blocked because upgrade is in progress", authTag)
  1109  	} else {
  1110  		return nil // allow all logins
  1111  	}
  1112  }
  1113  
  1114  // ensureMongoServer ensures that mongo is installed and running,
  1115  // and ready for opening a state connection.
  1116  func (a *MachineAgent) ensureMongoServer(agentConfig agent.Config) (err error) {
  1117  	a.mongoInitMutex.Lock()
  1118  	defer a.mongoInitMutex.Unlock()
  1119  	if a.mongoInitialized {
  1120  		logger.Debugf("mongo is already initialized")
  1121  		return nil
  1122  	}
  1123  	defer func() {
  1124  		if err == nil {
  1125  			a.mongoInitialized = true
  1126  		}
  1127  	}()
  1128  
  1129  	servingInfo, ok := agentConfig.StateServingInfo()
  1130  	if !ok {
  1131  		return fmt.Errorf("state worker was started with no state serving info")
  1132  	}
  1133  
  1134  	// When upgrading from a pre-HA-capable environment,
  1135  	// we must add machine-0 to the admin database and
  1136  	// initiate its replicaset.
  1137  	//
  1138  	// TODO(axw) remove this when we no longer need
  1139  	// to upgrade from pre-HA-capable environments.
  1140  	var shouldInitiateMongoServer bool
  1141  	var addrs []network.Address
  1142  	if isPreHAVersion(a.previousAgentVersion) {
  1143  		_, err := a.ensureMongoAdminUser(agentConfig)
  1144  		if err != nil {
  1145  			return err
  1146  		}
  1147  		if servingInfo.SharedSecret == "" {
  1148  			servingInfo.SharedSecret, err = mongo.GenerateSharedSecret()
  1149  			if err != nil {
  1150  				return err
  1151  			}
  1152  			if err = a.ChangeConfig(func(config agent.ConfigSetter) error {
  1153  				config.SetStateServingInfo(servingInfo)
  1154  				return nil
  1155  			}); err != nil {
  1156  				return err
  1157  			}
  1158  			agentConfig = a.CurrentConfig()
  1159  		}
  1160  		// Note: we set Direct=true in the mongo options because it's
  1161  		// possible that we've previously upgraded the mongo server's
  1162  		// configuration to form a replicaset, but failed to initiate it.
  1163  		st, m, err := openState(agentConfig, mongo.DialOpts{Direct: true})
  1164  		if err != nil {
  1165  			return err
  1166  		}
  1167  		ssi := cmdutil.ParamsStateServingInfoToStateStateServingInfo(servingInfo)
  1168  		if err := st.SetStateServingInfo(ssi); err != nil {
  1169  			st.Close()
  1170  			return fmt.Errorf("cannot set state serving info: %v", err)
  1171  		}
  1172  		st.Close()
  1173  		addrs = m.Addresses()
  1174  		shouldInitiateMongoServer = true
  1175  	}
  1176  
  1177  	// ensureMongoServer installs/upgrades the init config as necessary.
  1178  	ensureServerParams, err := cmdutil.NewEnsureServerParams(agentConfig)
  1179  	if err != nil {
  1180  		return err
  1181  	}
  1182  	if err := cmdutil.EnsureMongoServer(ensureServerParams); err != nil {
  1183  		return err
  1184  	}
  1185  	if !shouldInitiateMongoServer {
  1186  		return nil
  1187  	}
  1188  
  1189  	// Initiate the replicaset for upgraded environments.
  1190  	//
  1191  	// TODO(axw) remove this when we no longer need
  1192  	// to upgrade from pre-HA-capable environments.
  1193  	stateInfo, ok := agentConfig.MongoInfo()
  1194  	if !ok {
  1195  		return fmt.Errorf("state worker was started with no state serving info")
  1196  	}
  1197  	dialInfo, err := mongo.DialInfo(stateInfo.Info, mongo.DefaultDialOpts())
  1198  	if err != nil {
  1199  		return err
  1200  	}
  1201  	peerAddr := mongo.SelectPeerAddress(addrs)
  1202  	if peerAddr == "" {
  1203  		return fmt.Errorf("no appropriate peer address found in %q", addrs)
  1204  	}
  1205  	if err := maybeInitiateMongoServer(peergrouper.InitiateMongoParams{
  1206  		DialInfo:       dialInfo,
  1207  		MemberHostPort: net.JoinHostPort(peerAddr, fmt.Sprint(servingInfo.StatePort)),
  1208  		// TODO(dfc) InitiateMongoParams should take a Tag
  1209  		User:     stateInfo.Tag.String(),
  1210  		Password: stateInfo.Password,
  1211  	}); err != nil && err != peergrouper.ErrReplicaSetAlreadyInitiated {
  1212  		return err
  1213  	}
  1214  	return nil
  1215  }
  1216  
  1217  func (a *MachineAgent) ensureMongoAdminUser(agentConfig agent.Config) (added bool, err error) {
  1218  	stateInfo, ok1 := agentConfig.MongoInfo()
  1219  	servingInfo, ok2 := agentConfig.StateServingInfo()
  1220  	if !ok1 || !ok2 {
  1221  		return false, fmt.Errorf("no state serving info configuration")
  1222  	}
  1223  	dialInfo, err := mongo.DialInfo(stateInfo.Info, mongo.DefaultDialOpts())
  1224  	if err != nil {
  1225  		return false, err
  1226  	}
  1227  	if len(dialInfo.Addrs) > 1 {
  1228  		logger.Infof("more than one state server; admin user must exist")
  1229  		return false, nil
  1230  	}
  1231  	return ensureMongoAdminUser(mongo.EnsureAdminUserParams{
  1232  		DialInfo:  dialInfo,
  1233  		Namespace: agentConfig.Value(agent.Namespace),
  1234  		DataDir:   agentConfig.DataDir(),
  1235  		Port:      servingInfo.StatePort,
  1236  		User:      stateInfo.Tag.String(),
  1237  		Password:  stateInfo.Password,
  1238  	})
  1239  }
  1240  
  1241  func isPreHAVersion(v version.Number) bool {
  1242  	return v.Compare(version.MustParse("1.19.0")) < 0
  1243  }
  1244  
  1245  func openState(agentConfig agent.Config, dialOpts mongo.DialOpts) (_ *state.State, _ *state.Machine, err error) {
  1246  	info, ok := agentConfig.MongoInfo()
  1247  	if !ok {
  1248  		return nil, nil, fmt.Errorf("no state info available")
  1249  	}
  1250  	st, err := state.Open(info, dialOpts, environs.NewStatePolicy())
  1251  	if err != nil {
  1252  		return nil, nil, err
  1253  	}
  1254  	defer func() {
  1255  		if err != nil {
  1256  			st.Close()
  1257  		}
  1258  	}()
  1259  	m0, err := st.FindEntity(agentConfig.Tag())
  1260  	if err != nil {
  1261  		if errors.IsNotFound(err) {
  1262  			err = worker.ErrTerminateAgent
  1263  		}
  1264  		return nil, nil, err
  1265  	}
  1266  	m := m0.(*state.Machine)
  1267  	if m.Life() == state.Dead {
  1268  		return nil, nil, worker.ErrTerminateAgent
  1269  	}
  1270  	// Check the machine nonce as provisioned matches the agent.Conf value.
  1271  	if !m.CheckProvisioned(agentConfig.Nonce()) {
  1272  		// The agent is running on a different machine to the one it
  1273  		// should be according to state. It must stop immediately.
  1274  		logger.Errorf("running machine %v agent on inappropriate instance", m)
  1275  		return nil, nil, worker.ErrTerminateAgent
  1276  	}
  1277  	return st, m, nil
  1278  }
  1279  
  1280  // startWorkerAfterUpgrade starts a worker to run the specified child worker
  1281  // but only after waiting for upgrades to complete.
  1282  func (a *MachineAgent) startWorkerAfterUpgrade(runner worker.Runner, name string, start func() (worker.Worker, error)) {
  1283  	runner.StartWorker(name, func() (worker.Worker, error) {
  1284  		return a.upgradeWaiterWorker(start), nil
  1285  	})
  1286  }
  1287  
  1288  // upgradeWaiterWorker runs the specified worker after upgrades have completed.
  1289  func (a *MachineAgent) upgradeWaiterWorker(start func() (worker.Worker, error)) worker.Worker {
  1290  	return worker.NewSimpleWorker(func(stop <-chan struct{}) error {
  1291  		// Wait for the upgrade to complete (or for us to be stopped).
  1292  		select {
  1293  		case <-stop:
  1294  			return nil
  1295  		case <-a.upgradeWorkerContext.UpgradeComplete:
  1296  		}
  1297  		// Upgrades are done, start the worker.
  1298  		worker, err := start()
  1299  		if err != nil {
  1300  			return err
  1301  		}
  1302  		// Wait for worker to finish or for us to be stopped.
  1303  		waitCh := make(chan error)
  1304  		go func() {
  1305  			waitCh <- worker.Wait()
  1306  		}()
  1307  		select {
  1308  		case err := <-waitCh:
  1309  			return err
  1310  		case <-stop:
  1311  			worker.Kill()
  1312  		}
  1313  		return <-waitCh // Ensure worker has stopped before returning.
  1314  	})
  1315  }
  1316  
  1317  func (a *MachineAgent) setMachineStatus(apiState *api.State, status params.Status, info string) error {
  1318  	tag := a.Tag().(names.MachineTag)
  1319  	machine, err := apiState.Machiner().Machine(tag)
  1320  	if err != nil {
  1321  		return errors.Trace(err)
  1322  	}
  1323  	if err := machine.SetStatus(status, info, nil); err != nil {
  1324  		return errors.Trace(err)
  1325  	}
  1326  	return nil
  1327  }
  1328  
  1329  // WorkersStarted returns a channel that's closed once all top level workers
  1330  // have been started. This is provided for testing purposes.
  1331  func (a *MachineAgent) WorkersStarted() <-chan struct{} {
  1332  	return a.workersStarted
  1333  }
  1334  
  1335  func (a *MachineAgent) Tag() names.Tag {
  1336  	return names.NewMachineTag(a.machineId)
  1337  }
  1338  
  1339  func (a *MachineAgent) createJujuRun(dataDir string) error {
  1340  	// TODO do not remove the symlink if it already points
  1341  	// to the right place.
  1342  	if err := os.Remove(JujuRun); err != nil && !os.IsNotExist(err) {
  1343  		return err
  1344  	}
  1345  	jujud := filepath.Join(dataDir, "tools", a.Tag().String(), jujunames.Jujud)
  1346  	return symlink.New(jujud, JujuRun)
  1347  }
  1348  
  1349  func (a *MachineAgent) uninstallAgent(agentConfig agent.Config) error {
  1350  	var errors []error
  1351  	agentServiceName := agentConfig.Value(agent.AgentServiceName)
  1352  	if agentServiceName == "" {
  1353  		// For backwards compatibility, handle lack of AgentServiceName.
  1354  		agentServiceName = os.Getenv("UPSTART_JOB")
  1355  	}
  1356  	if agentServiceName != "" {
  1357  		if err := service.NewService(agentServiceName, common.Conf{}).Remove(); err != nil {
  1358  			errors = append(errors, fmt.Errorf("cannot remove service %q: %v", agentServiceName, err))
  1359  		}
  1360  	}
  1361  	// Remove the juju-run symlink.
  1362  	if err := os.Remove(JujuRun); err != nil && !os.IsNotExist(err) {
  1363  		errors = append(errors, err)
  1364  	}
  1365  
  1366  	namespace := agentConfig.Value(agent.Namespace)
  1367  	if err := mongo.RemoveService(namespace); err != nil {
  1368  		errors = append(errors, fmt.Errorf("cannot stop/remove mongo service with namespace %q: %v", namespace, err))
  1369  	}
  1370  	if err := os.RemoveAll(agentConfig.DataDir()); err != nil {
  1371  		errors = append(errors, err)
  1372  	}
  1373  	if len(errors) == 0 {
  1374  		return nil
  1375  	}
  1376  	return fmt.Errorf("uninstall failed: %v", errors)
  1377  }
  1378  
  1379  func newConnRunner(conns ...cmdutil.Pinger) worker.Runner {
  1380  	return worker.NewRunner(cmdutil.ConnectionIsFatal(logger, conns...), cmdutil.MoreImportant)
  1381  }
  1382  
// MongoSessioner is implemented by anything that can provide a mongo
// session. newSingularStateRunner accepts it in place of a concrete state
// type.
type MongoSessioner interface {
	// MongoSession returns the mongo session held by the implementation.
	MongoSession() *mgo.Session
}
  1386  
  1387  func newSingularStateRunner(runner worker.Runner, st MongoSessioner, m *state.Machine) (worker.Runner, error) {
  1388  	singularStateConn := singularStateConn{st.MongoSession(), m}
  1389  	singularRunner, err := newSingularRunner(runner, singularStateConn)
  1390  	if err != nil {
  1391  		return nil, errors.Annotate(err, "cannot make singular State Runner")
  1392  	}
  1393  	return singularRunner, err
  1394  }
  1395  
// singularStateConn implements singular.Conn on
// top of a State connection.
type singularStateConn struct {
	// session is the mongo session used for master checks and pings.
	session *mgo.Session
	// machine is the machine passed to mongo.IsMaster.
	machine *state.Machine
}

// IsMaster returns the result of mongo.IsMaster for the connection's session
// and machine.
func (c singularStateConn) IsMaster() (bool, error) {
	return mongo.IsMaster(c.session, c.machine)
}

// Ping pings the underlying mongo session and returns any error from it.
func (c singularStateConn) Ping() error {
	return c.session.Ping()
}
  1410  
  1411  func metricAPI(st *api.State) metricsmanager.MetricsManagerClient {
  1412  	return metricsmanager.NewClient(st)
  1413  }
  1414  
  1415  // newDeployContext gives the tests the opportunity to create a deployer.Context
  1416  // that can be used for testing so as to avoid (1) deploying units to the system
  1417  // running the tests and (2) get access to the *State used internally, so that
  1418  // tests can be run without waiting for the 5s watcher refresh time to which we would
  1419  // otherwise be restricted.
  1420  var newDeployContext = func(st *apideployer.State, agentConfig agent.Config) deployer.Context {
  1421  	return deployer.NewSimpleContext(agentConfig, st)
  1422  }