github.com/rogpeppe/juju@v0.0.0-20140613142852-6337964b789e/cmd/jujud/machine.go

     1  // Copyright 2012, 2013 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package main
     5  
     6  import (
     7  	"fmt"
     8  	"net"
     9  	"os"
    10  	"path/filepath"
    11  	"runtime"
    12  	"time"
    13  
    14  	"github.com/juju/charm"
    15  	"github.com/juju/cmd"
    16  	"github.com/juju/errors"
    17  	"github.com/juju/loggo"
    18  	"github.com/juju/names"
    19  	"github.com/juju/utils"
    20  	"github.com/juju/utils/voyeur"
    21  	"labix.org/v2/mgo"
    22  	"launchpad.net/gnuflag"
    23  	"launchpad.net/tomb"
    24  
    25  	"github.com/juju/juju/agent"
    26  	"github.com/juju/juju/container/kvm"
    27  	"github.com/juju/juju/environs"
    28  	"github.com/juju/juju/instance"
    29  	"github.com/juju/juju/mongo"
    30  	"github.com/juju/juju/network"
    31  	"github.com/juju/juju/provider"
    32  	"github.com/juju/juju/state"
    33  	"github.com/juju/juju/state/api"
    34  	apiagent "github.com/juju/juju/state/api/agent"
    35  	"github.com/juju/juju/state/api/params"
    36  	"github.com/juju/juju/state/apiserver"
    37  	"github.com/juju/juju/upgrades"
    38  	"github.com/juju/juju/upstart"
    39  	"github.com/juju/juju/version"
    40  	"github.com/juju/juju/worker"
    41  	"github.com/juju/juju/worker/apiaddressupdater"
    42  	"github.com/juju/juju/worker/authenticationworker"
    43  	"github.com/juju/juju/worker/charmrevisionworker"
    44  	"github.com/juju/juju/worker/cleaner"
    45  	"github.com/juju/juju/worker/deployer"
    46  	"github.com/juju/juju/worker/firewaller"
    47  	"github.com/juju/juju/worker/instancepoller"
    48  	"github.com/juju/juju/worker/localstorage"
    49  	workerlogger "github.com/juju/juju/worker/logger"
    50  	"github.com/juju/juju/worker/machineenvironmentworker"
    51  	"github.com/juju/juju/worker/machiner"
    52  	"github.com/juju/juju/worker/minunitsworker"
    53  	"github.com/juju/juju/worker/peergrouper"
    54  	"github.com/juju/juju/worker/provisioner"
    55  	"github.com/juju/juju/worker/resumer"
    56  	"github.com/juju/juju/worker/rsyslog"
    57  	"github.com/juju/juju/worker/singular"
    58  	"github.com/juju/juju/worker/terminationworker"
    59  	"github.com/juju/juju/worker/upgrader"
    60  )
    61  
    62  var logger = loggo.GetLogger("juju.cmd.jujud")
    63  
    64  var newRunner = worker.NewRunner
    65  
    66  const bootstrapMachineId = "0"
    67  
     68  // eitherState can be either a *state.State or an *api.State.
    69  type eitherState interface{}
    70  
    71  var (
    72  	retryDelay      = 3 * time.Second
    73  	jujuRun         = "/usr/local/bin/juju-run"
    74  	useMultipleCPUs = utils.UseMultipleCPUs
    75  
    76  	// The following are defined as variables to
    77  	// allow the tests to intercept calls to the functions.
    78  	ensureMongoServer        = mongo.EnsureServer
    79  	maybeInitiateMongoServer = peergrouper.MaybeInitiateMongoServer
    80  	ensureMongoAdminUser     = mongo.EnsureAdminUser
    81  	newSingularRunner        = singular.New
    82  	peergrouperNew           = peergrouper.New
    83  
     84  	// reportOpenedState is exposed for tests to know when
    85  	// the State has been successfully opened.
    86  	reportOpenedState = func(eitherState) {}
    87  
    88  	// reportOpenedAPI is exposed for tests to know when
    89  	// the API has been successfully opened.
    90  	reportOpenedAPI = func(eitherState) {}
    91  )
    92  
    93  // MachineAgent is a cmd.Command responsible for running a machine agent.
    94  type MachineAgent struct {
    95  	cmd.CommandBase
    96  	tomb tomb.Tomb
    97  	AgentConf
    98  	MachineId        string
    99  	runner           worker.Runner
   100  	configChangedVal voyeur.Value
   101  	upgradeComplete  chan struct{}
   102  	workersStarted   chan struct{}
   103  	st               *state.State
   104  }
   105  
   106  // Info returns usage information for the command.
   107  func (a *MachineAgent) Info() *cmd.Info {
   108  	return &cmd.Info{
   109  		Name:    "machine",
   110  		Purpose: "run a juju machine agent",
   111  	}
   112  }
   113  
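         // SetFlags adds the machine agent's command line flags to f.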
   114  func (a *MachineAgent) SetFlags(f *gnuflag.FlagSet) {
   115  	a.AgentConf.AddFlags(f)
   116  	f.StringVar(&a.MachineId, "machine-id", "", "id of the machine to run")
   117  }
   118  
   119  // Init initializes the command for running.
   120  func (a *MachineAgent) Init(args []string) error {
   121  	if !names.IsMachine(a.MachineId) {
   122  		return fmt.Errorf("--machine-id option must be set, and expects a non-negative integer")
   123  	}
   124  	if err := a.AgentConf.CheckArgs(args); err != nil {
   125  		return err
   126  	}
   127  	a.runner = newRunner(isFatal, moreImportant)
   128  	a.upgradeComplete = make(chan struct{})
   129  	a.workersStarted = make(chan struct{})
   130  	return nil
   131  }
   132  
   133  // Wait waits for the machine agent to finish.
   134  func (a *MachineAgent) Wait() error {
   135  	return a.tomb.Wait()
   136  }
   137  
   138  // Stop stops the machine agent.
   139  func (a *MachineAgent) Stop() error {
   140  	a.runner.Kill()
   141  	return a.tomb.Wait()
   142  }
   143  
   144  // Run runs a machine agent.
   145  func (a *MachineAgent) Run(_ *cmd.Context) error {
    146  	// Due to changes in the logging, and because we need to support old
    147  	// environments that have been upgraded, we explicitly remove the file
    148  	// writer if one has been added; otherwise every line would appear
    149  	// twice in the log file.
   150  	loggo.RemoveWriter("logfile")
   151  	defer a.tomb.Done()
   152  	logger.Infof("machine agent %v start (%s [%s])", a.Tag(), version.Current, runtime.Compiler)
   153  	if err := a.ReadConfig(a.Tag()); err != nil {
   154  		return fmt.Errorf("cannot read agent configuration: %v", err)
   155  	}
   156  	a.configChangedVal.Set(struct{}{})
   157  	agentConfig := a.CurrentConfig()
   158  	charm.CacheDir = filepath.Join(agentConfig.DataDir(), "charmcache")
   159  	if err := a.createJujuRun(agentConfig.DataDir()); err != nil {
   160  		return fmt.Errorf("cannot create juju run symlink: %v", err)
   161  	}
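         	// Start the top-level workers: the API worker, the state starter
         	// (which starts or stops the state worker as state serving info
         	// appears in or disappears from the agent configuration), and the
         	// termination worker.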
   162  	a.runner.StartWorker("api", a.APIWorker)
   163  	a.runner.StartWorker("statestarter", a.newStateStarterWorker)
   164  	a.runner.StartWorker("termination", func() (worker.Worker, error) {
   165  		return terminationworker.NewWorker(), nil
   166  	})
   167  	// At this point, all workers will have been configured to start
   168  	close(a.workersStarted)
   169  	err := a.runner.Wait()
   170  	if err == worker.ErrTerminateAgent {
   171  		err = a.uninstallAgent(agentConfig)
   172  	}
   173  	err = agentDone(err)
   174  	a.tomb.Kill(err)
   175  	return err
   176  }
   177  
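         // ChangeConfig applies the given mutation to the agent configuration
         // and notifies anyone watching for configuration changes (in
         // particular the state starter).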
   178  func (a *MachineAgent) ChangeConfig(mutate func(config agent.ConfigSetter)) error {
   179  	err := a.AgentConf.ChangeConfig(mutate)
   180  	a.configChangedVal.Set(struct{}{})
   181  	return err
   182  }
   183  
   184  // newStateStarterWorker wraps stateStarter in a simple worker for use in
   185  // a.runner.StartWorker.
   186  func (a *MachineAgent) newStateStarterWorker() (worker.Worker, error) {
   187  	return worker.NewSimpleWorker(a.stateStarter), nil
   188  }
   189  
   190  // stateStarter watches for changes to the agent configuration, and
   191  // starts or stops the state worker as appropriate. We watch the agent
   192  // configuration because the agent configuration has all the details
   193  // that we need to start a state server, whether they have been cached
   194  // or read from the state.
   195  //
   196  // It will stop working as soon as stopch is closed.
   197  func (a *MachineAgent) stateStarter(stopch <-chan struct{}) error {
   198  	confWatch := a.configChangedVal.Watch()
   199  	defer confWatch.Close()
   200  	watchCh := make(chan struct{})
   201  	go func() {
   202  		for confWatch.Next() {
   203  			watchCh <- struct{}{}
   204  		}
   205  	}()
   206  	for {
   207  		select {
   208  		case <-watchCh:
   209  			agentConfig := a.CurrentConfig()
   210  
   211  			// N.B. StartWorker and StopWorker are idempotent.
   212  			_, ok := agentConfig.StateServingInfo()
   213  			if ok {
   214  				a.runner.StartWorker("state", func() (worker.Worker, error) {
   215  					return a.StateWorker()
   216  				})
   217  			} else {
   218  				a.runner.StopWorker("state")
   219  			}
   220  		case <-stopch:
   221  			return nil
   222  		}
   223  	}
   224  }
   225  
   226  // APIWorker returns a Worker that connects to the API and starts any
   227  // workers that need an API connection.
   228  func (a *MachineAgent) APIWorker() (worker.Worker, error) {
   229  	agentConfig := a.CurrentConfig()
   230  	st, entity, err := openAPIState(agentConfig, a)
   231  	if err != nil {
   232  		return nil, err
   233  	}
   234  	reportOpenedAPI(st)
   235  
   236  	// Refresh the configuration, since it may have been updated after opening state.
   237  	agentConfig = a.CurrentConfig()
   238  
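         	// If any of this machine's jobs needs direct state access, fetch
         	// the state serving info over the API and cache it in the agent
         	// configuration so that the state worker can be started.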
   239  	for _, job := range entity.Jobs() {
   240  		if job.NeedsState() {
   241  			info, err := st.Agent().StateServingInfo()
   242  			if err != nil {
   243  				return nil, fmt.Errorf("cannot get state serving info: %v", err)
   244  			}
   245  			err = a.ChangeConfig(func(config agent.ConfigSetter) {
   246  				config.SetStateServingInfo(info)
   247  			})
   248  			if err != nil {
   249  				return nil, err
   250  			}
   251  			agentConfig = a.CurrentConfig()
   252  			break
   253  		}
   254  	}
   255  
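         	// Machines that manage the environment accumulate rsyslog logs
         	// locally and run the singular workers; all other machines
         	// forward their logs.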
   256  	rsyslogMode := rsyslog.RsyslogModeForwarding
   257  	runner := newRunner(connectionIsFatal(st), moreImportant)
   258  	var singularRunner worker.Runner
   259  	for _, job := range entity.Jobs() {
   260  		if job == params.JobManageEnviron {
   261  			rsyslogMode = rsyslog.RsyslogModeAccumulate
   262  			conn := singularAPIConn{st, st.Agent()}
   263  			singularRunner, err = newSingularRunner(runner, conn)
   264  			if err != nil {
   265  				return nil, fmt.Errorf("cannot make singular API Runner: %v", err)
   266  			}
   267  			break
   268  		}
   269  	}
   270  
   271  	// Run the upgrader and the upgrade-steps worker without waiting for
   272  	// the upgrade steps to complete.
   273  	runner.StartWorker("upgrader", func() (worker.Worker, error) {
   274  		return upgrader.NewUpgrader(st.Upgrader(), agentConfig), nil
   275  	})
   276  	runner.StartWorker("upgrade-steps", func() (worker.Worker, error) {
   277  		return a.upgradeWorker(st, entity.Jobs(), agentConfig), nil
   278  	})
   279  
   280  	// All other workers must wait for the upgrade steps to complete
   281  	// before starting.
   282  	a.startWorkerAfterUpgrade(runner, "machiner", func() (worker.Worker, error) {
   283  		return machiner.NewMachiner(st.Machiner(), agentConfig), nil
   284  	})
   285  	a.startWorkerAfterUpgrade(runner, "apiaddressupdater", func() (worker.Worker, error) {
   286  		return apiaddressupdater.NewAPIAddressUpdater(st.Machiner(), a), nil
   287  	})
   288  	a.startWorkerAfterUpgrade(runner, "logger", func() (worker.Worker, error) {
   289  		return workerlogger.NewLogger(st.Logger(), agentConfig), nil
   290  	})
   291  	a.startWorkerAfterUpgrade(runner, "machineenvironmentworker", func() (worker.Worker, error) {
   292  		return machineenvironmentworker.NewMachineEnvironmentWorker(st.Environment(), agentConfig), nil
   293  	})
   294  	a.startWorkerAfterUpgrade(runner, "rsyslog", func() (worker.Worker, error) {
   295  		return newRsyslogConfigWorker(st.Rsyslog(), agentConfig, rsyslogMode)
   296  	})
   297  
   298  	// If not a local provider bootstrap machine, start the worker to
   299  	// manage SSH keys.
   300  	providerType := agentConfig.Value(agent.ProviderType)
   301  	if providerType != provider.Local || a.MachineId != bootstrapMachineId {
   302  		a.startWorkerAfterUpgrade(runner, "authenticationworker", func() (worker.Worker, error) {
   303  			return authenticationworker.NewWorker(st.KeyUpdater(), agentConfig), nil
   304  		})
   305  	}
   306  
   307  	// Perform the operations needed to set up hosting for containers.
   308  	if err := a.setupContainerSupport(runner, st, entity, agentConfig); err != nil {
   309  		return nil, fmt.Errorf("setting up container support: %v", err)
   310  	}
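         	// Start the remaining workers according to this machine's jobs.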
   311  	for _, job := range entity.Jobs() {
   312  		switch job {
   313  		case params.JobHostUnits:
   314  			a.startWorkerAfterUpgrade(runner, "deployer", func() (worker.Worker, error) {
   315  				apiDeployer := st.Deployer()
   316  				context := newDeployContext(apiDeployer, agentConfig)
   317  				return deployer.NewDeployer(apiDeployer, context), nil
   318  			})
   319  		case params.JobManageEnviron:
   320  			a.startWorkerAfterUpgrade(singularRunner, "environ-provisioner", func() (worker.Worker, error) {
   321  				return provisioner.NewEnvironProvisioner(st.Provisioner(), agentConfig), nil
   322  			})
   323  			// TODO(axw) 2013-09-24 bug #1229506
   324  			// Make another job to enable the firewaller. Not all
   325  			// environments are capable of managing ports
   326  			// centrally.
   327  			a.startWorkerAfterUpgrade(singularRunner, "firewaller", func() (worker.Worker, error) {
   328  				return firewaller.NewFirewaller(st.Firewaller())
   329  			})
   330  			a.startWorkerAfterUpgrade(singularRunner, "charm-revision-updater", func() (worker.Worker, error) {
   331  				return charmrevisionworker.NewRevisionUpdateWorker(st.CharmRevisionUpdater()), nil
   332  			})
   333  		case params.JobManageStateDeprecated:
   334  			// Legacy environments may set this, but we ignore it.
   335  		default:
   336  			// TODO(dimitern): Once all workers moved over to using
   337  			// the API, report "unknown job type" here.
   338  		}
   339  	}
   340  	return newCloseWorker(runner, st), nil // Note: a worker.Runner is itself a worker.Worker.
   341  }
   342  
   343  // setupContainerSupport determines what containers can be run on this machine and
   344  // initialises suitable infrastructure to support such containers.
   345  func (a *MachineAgent) setupContainerSupport(runner worker.Runner, st *api.State, entity *apiagent.Entity, agentConfig agent.Config) error {
   346  	var supportedContainers []instance.ContainerType
    347  	// We don't yet support nested LXC containers, but anything else can run an LXC container.
   348  	if entity.ContainerType() != instance.LXC {
   349  		supportedContainers = append(supportedContainers, instance.LXC)
   350  	}
   351  	supportsKvm, err := kvm.IsKVMSupported()
   352  	if err != nil {
   353  		logger.Warningf("determining kvm support: %v\nno kvm containers possible", err)
   354  	}
   355  	if err == nil && supportsKvm {
   356  		supportedContainers = append(supportedContainers, instance.KVM)
   357  	}
   358  	return a.updateSupportedContainers(runner, st, entity.Tag(), supportedContainers, agentConfig)
   359  }
   360  
   361  // updateSupportedContainers records in state that a machine can run the specified containers.
    362  // It starts a watcher; when a container of a given type is first added to the machine,
    363  // the watcher is killed, the machine is set up to be able to start containers of that type,
    364  // and a suitable provisioner is started.
   365  func (a *MachineAgent) updateSupportedContainers(
   366  	runner worker.Runner,
   367  	st *api.State,
   368  	tag string,
   369  	containers []instance.ContainerType,
   370  	agentConfig agent.Config,
   371  ) error {
   372  	pr := st.Provisioner()
   373  	machine, err := pr.Machine(tag)
   374  	if err != nil {
   375  		return fmt.Errorf("%s is not in state: %v", tag, err)
   376  	}
   377  	if len(containers) == 0 {
   378  		if err := machine.SupportsNoContainers(); err != nil {
   379  			return fmt.Errorf("clearing supported containers for %s: %v", tag, err)
   380  		}
   381  		return nil
   382  	}
   383  	if err := machine.SetSupportedContainers(containers...); err != nil {
   384  		return fmt.Errorf("setting supported containers for %s: %v", tag, err)
   385  	}
   386  	initLock, err := hookExecutionLock(agentConfig.DataDir())
   387  	if err != nil {
   388  		return err
   389  	}
   390  	// Start the watcher to fire when a container is first requested on the machine.
   391  	watcherName := fmt.Sprintf("%s-container-watcher", machine.Id())
   392  	handler := provisioner.NewContainerSetupHandler(
   393  		runner,
   394  		watcherName,
   395  		containers,
   396  		machine,
   397  		pr,
   398  		agentConfig,
   399  		initLock,
   400  	)
   401  	a.startWorkerAfterUpgrade(runner, watcherName, func() (worker.Worker, error) {
   402  		return worker.NewStringsWorker(handler), nil
   403  	})
   404  	return nil
   405  }
   406  
   407  // StateWorker returns a worker running all the workers that require
   408  // a *state.State connection.
   409  func (a *MachineAgent) StateWorker() (worker.Worker, error) {
   410  	agentConfig := a.CurrentConfig()
   411  
   412  	// Create system-identity file
   413  	if err := agent.WriteSystemIdentityFile(agentConfig); err != nil {
   414  		return nil, err
   415  	}
   416  
    417  	// Start the MongoDB server
   418  	if err := a.ensureMongoServer(agentConfig); err != nil {
   419  		return nil, err
   420  	}
   421  	st, m, err := openState(agentConfig)
   422  	if err != nil {
   423  		return nil, err
   424  	}
   425  	reportOpenedState(st)
   426  
   427  	singularStateConn := singularStateConn{st.MongoSession(), m}
   428  	runner := newRunner(connectionIsFatal(st), moreImportant)
   429  	singularRunner, err := newSingularRunner(runner, singularStateConn)
   430  	if err != nil {
   431  		return nil, fmt.Errorf("cannot make singular State Runner: %v", err)
   432  	}
   433  
    434  	// We take advantage of special knowledge here: the storage provider is
    435  	// only ever wanted on one machine, and that is the "bootstrap" node.
   436  	providerType := agentConfig.Value(agent.ProviderType)
   437  	if (providerType == provider.Local || provider.IsManual(providerType)) && m.Id() == bootstrapMachineId {
   438  		a.startWorkerAfterUpgrade(runner, "local-storage", func() (worker.Worker, error) {
   439  			// TODO(axw) 2013-09-24 bug #1229507
   440  			// Make another job to enable storage.
   441  			// There's nothing special about this.
   442  			return localstorage.NewWorker(agentConfig), nil
   443  		})
   444  	}
   445  	for _, job := range m.Jobs() {
   446  		switch job {
   447  		case state.JobHostUnits:
   448  			// Implemented in APIWorker.
   449  		case state.JobManageEnviron:
   450  			useMultipleCPUs()
   451  			a.startWorkerAfterUpgrade(runner, "instancepoller", func() (worker.Worker, error) {
   452  				return instancepoller.NewWorker(st), nil
   453  			})
   454  			a.startWorkerAfterUpgrade(runner, "peergrouper", func() (worker.Worker, error) {
   455  				return peergrouperNew(st)
   456  			})
   457  			runner.StartWorker("apiserver", func() (worker.Worker, error) {
   458  				// If the configuration does not have the required information,
   459  				// it is currently not a recoverable error, so we kill the whole
   460  				// agent, potentially enabling human intervention to fix
   461  				// the agent's configuration file. In the future, we may retrieve
   462  				// the state server certificate and key from the state, and
   463  				// this should then change.
   464  				info, ok := agentConfig.StateServingInfo()
   465  				if !ok {
   466  					return nil, &fatalError{"StateServingInfo not available and we need it"}
   467  				}
   468  				port := info.APIPort
   469  				cert := []byte(info.Cert)
   470  				key := []byte(info.PrivateKey)
   471  
   472  				if len(cert) == 0 || len(key) == 0 {
   473  					return nil, &fatalError{"configuration does not have state server cert/key"}
   474  				}
   475  				dataDir := agentConfig.DataDir()
   476  				logDir := agentConfig.LogDir()
   477  				return apiserver.NewServer(st, apiserver.ServerConfig{
   478  					Addr:    fmt.Sprintf(":%d", port),
   479  					Cert:    cert,
   480  					Key:     key,
   481  					DataDir: dataDir,
   482  					LogDir:  logDir,
   483  				})
   484  			})
   485  			a.startWorkerAfterUpgrade(singularRunner, "cleaner", func() (worker.Worker, error) {
   486  				return cleaner.NewCleaner(st), nil
   487  			})
   488  			a.startWorkerAfterUpgrade(singularRunner, "resumer", func() (worker.Worker, error) {
   489  				// The action of resumer is so subtle that it is not tested,
   490  				// because we can't figure out how to do so without brutalising
   491  				// the transaction log.
   492  				return resumer.NewResumer(st), nil
   493  			})
   494  			a.startWorkerAfterUpgrade(singularRunner, "minunitsworker", func() (worker.Worker, error) {
   495  				return minunitsworker.NewMinUnitsWorker(st), nil
   496  			})
   497  		case state.JobManageStateDeprecated:
   498  			// Legacy environments may set this, but we ignore it.
   499  		default:
   500  			logger.Warningf("ignoring unknown job %q", job)
   501  		}
   502  	}
   503  	return newCloseWorker(runner, st), nil
   504  }
   505  
   506  // ensureMongoServer ensures that mongo is installed and running,
   507  // and ready for opening a state connection.
   508  func (a *MachineAgent) ensureMongoServer(agentConfig agent.Config) error {
   509  	servingInfo, ok := agentConfig.StateServingInfo()
   510  	if !ok {
   511  		return fmt.Errorf("state worker was started with no state serving info")
   512  	}
   513  	namespace := agentConfig.Value(agent.Namespace)
   514  
   515  	// When upgrading from a pre-HA-capable environment,
   516  	// we must add machine-0 to the admin database and
   517  	// initiate its replicaset.
   518  	//
   519  	// TODO(axw) remove this when we no longer need
   520  	// to upgrade from pre-HA-capable environments.
   521  	var shouldInitiateMongoServer bool
   522  	var addrs []network.Address
   523  	if isPreHAVersion(agentConfig.UpgradedToVersion()) {
   524  		_, err := a.ensureMongoAdminUser(agentConfig)
   525  		if err != nil {
   526  			return err
   527  		}
   528  		if servingInfo.SharedSecret == "" {
   529  			servingInfo.SharedSecret, err = mongo.GenerateSharedSecret()
   530  			if err != nil {
   531  				return err
   532  			}
   533  			if err = a.ChangeConfig(func(config agent.ConfigSetter) {
   534  				config.SetStateServingInfo(servingInfo)
   535  			}); err != nil {
   536  				return err
   537  			}
   538  			agentConfig = a.CurrentConfig()
   539  		}
   540  		st, m, err := openState(agentConfig)
   541  		if err != nil {
   542  			return err
   543  		}
   544  		if err := st.SetStateServingInfo(servingInfo); err != nil {
   545  			st.Close()
   546  			return fmt.Errorf("cannot set state serving info: %v", err)
   547  		}
   548  		st.Close()
   549  		addrs = m.Addresses()
   550  		shouldInitiateMongoServer = true
   551  	}
   552  
   553  	// ensureMongoServer installs/upgrades the upstart config as necessary.
   554  	if err := ensureMongoServer(
   555  		agentConfig.DataDir(),
   556  		namespace,
   557  		servingInfo,
   558  	); err != nil {
   559  		return err
   560  	}
   561  	if !shouldInitiateMongoServer {
   562  		return nil
   563  	}
   564  
   565  	// Initiate the replicaset for upgraded environments.
   566  	//
   567  	// TODO(axw) remove this when we no longer need
   568  	// to upgrade from pre-HA-capable environments.
   569  	stateInfo, ok := agentConfig.StateInfo()
   570  	if !ok {
    571  		return fmt.Errorf("state worker was started with no state info")
   572  	}
   573  	dialInfo, err := mongo.DialInfo(stateInfo.Info, mongo.DefaultDialOpts())
   574  	if err != nil {
   575  		return err
   576  	}
   577  	peerAddr := mongo.SelectPeerAddress(addrs)
   578  	if peerAddr == "" {
   579  		return fmt.Errorf("no appropriate peer address found in %q", addrs)
   580  	}
   581  	return maybeInitiateMongoServer(peergrouper.InitiateMongoParams{
   582  		DialInfo:       dialInfo,
   583  		MemberHostPort: net.JoinHostPort(peerAddr, fmt.Sprint(servingInfo.StatePort)),
   584  		User:           stateInfo.Tag,
   585  		Password:       stateInfo.Password,
   586  	})
   587  }
   588  
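         // ensureMongoAdminUser ensures that this machine's mongo admin user
         // exists, creating it if necessary, and reports whether it was added.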
   589  func (a *MachineAgent) ensureMongoAdminUser(agentConfig agent.Config) (added bool, err error) {
   590  	stateInfo, ok1 := agentConfig.StateInfo()
   591  	servingInfo, ok2 := agentConfig.StateServingInfo()
   592  	if !ok1 || !ok2 {
    593  		return false, fmt.Errorf("agent configuration has no state info or no state serving info")
   594  	}
   595  	dialInfo, err := mongo.DialInfo(stateInfo.Info, mongo.DefaultDialOpts())
   596  	if err != nil {
   597  		return false, err
   598  	}
   599  	if len(dialInfo.Addrs) > 1 {
   600  		logger.Infof("more than one state server; admin user must exist")
   601  		return false, nil
   602  	}
   603  	return ensureMongoAdminUser(mongo.EnsureAdminUserParams{
   604  		DialInfo:  dialInfo,
   605  		Namespace: agentConfig.Value(agent.Namespace),
   606  		DataDir:   agentConfig.DataDir(),
   607  		Port:      servingInfo.StatePort,
   608  		User:      stateInfo.Tag,
   609  		Password:  stateInfo.Password,
   610  	})
   611  }
   612  
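         // isPreHAVersion reports whether the given version predates
         // HA support, which was introduced in 1.19.0.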
   613  func isPreHAVersion(v version.Number) bool {
   614  	return v.Compare(version.MustParse("1.19.0")) < 0
   615  }
   616  
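         // openState opens the state connection described by the agent
         // configuration and returns it together with this agent's machine.
         // It returns worker.ErrTerminateAgent if the machine is no longer in
         // state, is dead, or was provisioned with a different nonce.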
   617  func openState(agentConfig agent.Config) (_ *state.State, _ *state.Machine, err error) {
   618  	info, ok := agentConfig.StateInfo()
   619  	if !ok {
   620  		return nil, nil, fmt.Errorf("no state info available")
   621  	}
   622  	st, err := state.Open(info, mongo.DialOpts{}, environs.NewStatePolicy())
   623  	if err != nil {
   624  		return nil, nil, err
   625  	}
   626  	defer func() {
   627  		if err != nil {
   628  			st.Close()
   629  		}
   630  	}()
   631  	m0, err := st.FindEntity(agentConfig.Tag())
   632  	if err != nil {
   633  		if errors.IsNotFound(err) {
   634  			err = worker.ErrTerminateAgent
   635  		}
   636  		return nil, nil, err
   637  	}
   638  	m := m0.(*state.Machine)
   639  	if m.Life() == state.Dead {
   640  		return nil, nil, worker.ErrTerminateAgent
   641  	}
    642  	// Check that the machine nonce recorded at provisioning time matches the agent configuration value.
   643  	if !m.CheckProvisioned(agentConfig.Nonce()) {
   644  		// The agent is running on a different machine to the one it
   645  		// should be according to state. It must stop immediately.
   646  		logger.Errorf("running machine %v agent on inappropriate instance", m)
   647  		return nil, nil, worker.ErrTerminateAgent
   648  	}
   649  	return st, m, nil
   650  }
   651  
   652  // startWorkerAfterUpgrade starts a worker to run the specified child worker
   653  // but only after waiting for upgrades to complete.
   654  func (a *MachineAgent) startWorkerAfterUpgrade(runner worker.Runner, name string, start func() (worker.Worker, error)) {
   655  	runner.StartWorker(name, func() (worker.Worker, error) {
   656  		return a.upgradeWaiterWorker(start), nil
   657  	})
   658  }
   659  
   660  // upgradeWaiterWorker runs the specified worker after upgrades have completed.
   661  func (a *MachineAgent) upgradeWaiterWorker(start func() (worker.Worker, error)) worker.Worker {
   662  	return worker.NewSimpleWorker(func(stop <-chan struct{}) error {
   663  		// Wait for the upgrade to complete (or for us to be stopped).
   664  		select {
   665  		case <-stop:
   666  			return nil
   667  		case <-a.upgradeComplete:
   668  		}
   669  		// Upgrades are done, start the worker.
   670  		worker, err := start()
   671  		if err != nil {
   672  			return err
   673  		}
   674  		// Wait for worker to finish or for us to be stopped.
   675  		waitCh := make(chan error)
   676  		go func() {
   677  			waitCh <- worker.Wait()
   678  		}()
   679  		select {
   680  		case err := <-waitCh:
   681  			return err
   682  		case <-stop:
   683  			worker.Kill()
   684  		}
   685  		return <-waitCh // Ensure worker has stopped before returning.
   686  	})
   687  }
   688  
   689  // upgradeWorker runs the required upgrade operations to upgrade to the current Juju version.
   690  func (a *MachineAgent) upgradeWorker(
   691  	apiState *api.State,
   692  	jobs []params.MachineJob,
   693  	agentConfig agent.Config,
   694  ) worker.Worker {
   695  	return worker.NewSimpleWorker(func(stop <-chan struct{}) error {
   696  		select {
   697  		case <-a.upgradeComplete:
   698  			// Our work is already done (we're probably being restarted
   699  			// because the API connection has gone down), so do nothing.
   700  			<-stop
   701  			return nil
   702  		default:
   703  		}
   704  		// If the machine agent is a state server, flag that state
   705  		// needs to be opened before running upgrade steps
   706  		needsState := false
   707  		for _, job := range jobs {
   708  			if job == params.JobManageEnviron {
   709  				needsState = true
   710  			}
   711  		}
   712  		// We need a *state.State for upgrades. We open it independently
   713  		// of StateWorker, because we have no guarantees about when
   714  		// and how often StateWorker might run.
   715  		var st *state.State
   716  		if needsState {
   717  			var err error
   718  			info, ok := agentConfig.StateInfo()
   719  			if !ok {
   720  				return fmt.Errorf("no state info available")
   721  			}
   722  			st, err = state.Open(info, mongo.DialOpts{}, environs.NewStatePolicy())
   723  			if err != nil {
   724  				return err
   725  			}
   726  			defer st.Close()
   727  		}
   728  		err := a.runUpgrades(st, apiState, jobs, agentConfig)
   729  		if err != nil {
   730  			return err
   731  		}
   732  		logger.Infof("upgrade to %v completed.", version.Current)
   733  		close(a.upgradeComplete)
   734  		<-stop
   735  		return nil
   736  	})
   737  }
   738  
    739  // runUpgrades runs the upgrade operations for each job type and updates UpgradedToVersion on success.
   740  func (a *MachineAgent) runUpgrades(
   741  	st *state.State,
   742  	apiState *api.State,
   743  	jobs []params.MachineJob,
   744  	agentConfig agent.Config,
   745  ) error {
   746  	from := version.Current
   747  	from.Number = agentConfig.UpgradedToVersion()
   748  	if from == version.Current {
   749  		logger.Infof("upgrade to %v already completed.", version.Current)
   750  		return nil
   751  	}
   752  	var err error
   753  	writeErr := a.ChangeConfig(func(agentConfig agent.ConfigSetter) {
   754  		context := upgrades.NewContext(agentConfig, apiState, st)
   755  		for _, job := range jobs {
   756  			target := upgradeTarget(job)
   757  			if target == "" {
   758  				continue
   759  			}
   760  			logger.Infof("starting upgrade from %v to %v for %v %q", from, version.Current, target, a.Tag())
   761  			if err = upgrades.PerformUpgrade(from.Number, target, context); err != nil {
   762  				err = fmt.Errorf("cannot perform upgrade from %v to %v for %v %q: %v", from, version.Current, target, a.Tag(), err)
   763  				return
   764  			}
   765  		}
   766  		agentConfig.SetUpgradedToVersion(version.Current.Number)
   767  	})
   768  	if writeErr != nil {
   769  		return fmt.Errorf("cannot write updated agent configuration: %v", writeErr)
   770  	}
   771  	return nil
   772  }
   773  
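         // upgradeTarget maps a machine job to the corresponding upgrade
         // target, or "" if the job has no associated upgrade steps.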
   774  func upgradeTarget(job params.MachineJob) upgrades.Target {
   775  	switch job {
   776  	case params.JobManageEnviron:
   777  		return upgrades.StateServer
   778  	case params.JobHostUnits:
   779  		return upgrades.HostMachine
   780  	}
   781  	return ""
   782  }
   783  
   784  // WorkersStarted returns a channel that's closed once all top level workers
   785  // have been started. This is provided for testing purposes.
   786  func (a *MachineAgent) WorkersStarted() <-chan struct{} {
   787  	return a.workersStarted
   788  }
   789  
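         // Tag returns the machine tag for this agent's machine id.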
   790  func (a *MachineAgent) Tag() string {
   791  	return names.NewMachineTag(a.MachineId).String()
   792  }
   793  
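         // createJujuRun creates the juju-run symlink, pointing it at the
         // jujud binary in the agent's tools directory.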
   794  func (a *MachineAgent) createJujuRun(dataDir string) error {
   795  	// TODO do not remove the symlink if it already points
   796  	// to the right place.
   797  	if err := os.Remove(jujuRun); err != nil && !os.IsNotExist(err) {
   798  		return err
   799  	}
   800  	jujud := filepath.Join(dataDir, "tools", a.Tag(), "jujud")
   801  	return os.Symlink(jujud, jujuRun)
   802  }
   803  
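         // uninstallAgent removes the agent's upstart service, the juju-run
         // symlink, the mongo service and the agent's data directory,
         // collecting any errors encountered along the way.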
   804  func (a *MachineAgent) uninstallAgent(agentConfig agent.Config) error {
   805  	var errors []error
   806  	agentServiceName := agentConfig.Value(agent.AgentServiceName)
   807  	if agentServiceName == "" {
   808  		// For backwards compatibility, handle lack of AgentServiceName.
   809  		agentServiceName = os.Getenv("UPSTART_JOB")
   810  	}
   811  	if agentServiceName != "" {
   812  		if err := upstart.NewService(agentServiceName).Remove(); err != nil {
   813  			errors = append(errors, fmt.Errorf("cannot remove service %q: %v", agentServiceName, err))
   814  		}
   815  	}
   816  	// Remove the juju-run symlink.
   817  	if err := os.Remove(jujuRun); err != nil && !os.IsNotExist(err) {
   818  		errors = append(errors, err)
   819  	}
   820  
   821  	namespace := agentConfig.Value(agent.Namespace)
   822  	if err := mongo.RemoveService(namespace); err != nil {
   823  		errors = append(errors, fmt.Errorf("cannot stop/remove mongo service with namespace %q: %v", namespace, err))
   824  	}
   825  	if err := os.RemoveAll(agentConfig.DataDir()); err != nil {
   826  		errors = append(errors, err)
   827  	}
   828  	if len(errors) == 0 {
   829  		return nil
   830  	}
   831  	return fmt.Errorf("uninstall failed: %v", errors)
   832  }
   833  
   834  // singularAPIConn implements singular.Conn on
   835  // top of an API connection.
   836  type singularAPIConn struct {
   837  	apiState   *api.State
   838  	agentState *apiagent.State
   839  }
   840  
   841  func (c singularAPIConn) IsMaster() (bool, error) {
   842  	return c.agentState.IsMaster()
   843  }
   844  
   845  func (c singularAPIConn) Ping() error {
   846  	return c.apiState.Ping()
   847  }
   848  
   849  // singularStateConn implements singular.Conn on
   850  // top of a State connection.
   851  type singularStateConn struct {
   852  	session *mgo.Session
   853  	machine *state.Machine
   854  }
   855  
   856  func (c singularStateConn) IsMaster() (bool, error) {
   857  	return mongo.IsMaster(c.session, c.machine)
   858  }
   859  
   860  func (c singularStateConn) Ping() error {
   861  	return c.session.Ping()
   862  }