github.com/makyo/juju@v0.0.0-20160425123129-2608902037e9/cmd/jujud/agent/machine.go (about)

     1  // Copyright 2012, 2013 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package agent
     5  
     6  import (
     7  	"fmt"
     8  	"io"
     9  	"net"
    10  	"os"
    11  	"path/filepath"
    12  	"runtime"
    13  	"strconv"
    14  	"sync"
    15  	"time"
    16  
    17  	"github.com/juju/cmd"
    18  	"github.com/juju/errors"
    19  	apiagent "github.com/juju/juju/api/agent"
    20  	apimachiner "github.com/juju/juju/api/machiner"
    21  	"github.com/juju/loggo"
    22  	"github.com/juju/names"
    23  	"github.com/juju/replicaset"
    24  	"github.com/juju/utils"
    25  	"github.com/juju/utils/clock"
    26  	"github.com/juju/utils/featureflag"
    27  	"github.com/juju/utils/series"
    28  	"github.com/juju/utils/set"
    29  	"github.com/juju/utils/symlink"
    30  	"github.com/juju/utils/voyeur"
    31  	"github.com/juju/version"
    32  	"gopkg.in/juju/charmrepo.v2-unstable"
    33  	"gopkg.in/mgo.v2"
    34  	"gopkg.in/natefinch/lumberjack.v2"
    35  	"launchpad.net/gnuflag"
    36  	"launchpad.net/tomb"
    37  
    38  	"github.com/juju/juju/agent"
    39  	"github.com/juju/juju/agent/tools"
    40  	"github.com/juju/juju/api"
    41  	apideployer "github.com/juju/juju/api/deployer"
    42  	"github.com/juju/juju/api/metricsmanager"
    43  	"github.com/juju/juju/apiserver"
    44  	"github.com/juju/juju/apiserver/params"
    45  	"github.com/juju/juju/cert"
    46  	"github.com/juju/juju/cmd/jujud/agent/machine"
    47  	"github.com/juju/juju/cmd/jujud/agent/model"
    48  	"github.com/juju/juju/cmd/jujud/reboot"
    49  	cmdutil "github.com/juju/juju/cmd/jujud/util"
    50  	"github.com/juju/juju/container"
    51  	"github.com/juju/juju/container/kvm"
    52  	"github.com/juju/juju/environs"
    53  	"github.com/juju/juju/environs/simplestreams"
    54  	"github.com/juju/juju/instance"
    55  	jujunames "github.com/juju/juju/juju/names"
    56  	"github.com/juju/juju/juju/paths"
    57  	"github.com/juju/juju/mongo"
    58  	"github.com/juju/juju/network"
    59  	"github.com/juju/juju/service"
    60  	"github.com/juju/juju/service/common"
    61  	"github.com/juju/juju/state"
    62  	"github.com/juju/juju/state/multiwatcher"
    63  	"github.com/juju/juju/storage/looputil"
    64  	"github.com/juju/juju/upgrades"
    65  	jujuversion "github.com/juju/juju/version"
    66  	"github.com/juju/juju/watcher"
    67  	"github.com/juju/juju/worker"
    68  	"github.com/juju/juju/worker/apicaller"
    69  	"github.com/juju/juju/worker/certupdater"
    70  	"github.com/juju/juju/worker/conv2state"
    71  	"github.com/juju/juju/worker/dblogpruner"
    72  	"github.com/juju/juju/worker/dependency"
    73  	"github.com/juju/juju/worker/deployer"
    74  	"github.com/juju/juju/worker/gate"
    75  	"github.com/juju/juju/worker/imagemetadataworker"
    76  	"github.com/juju/juju/worker/logsender"
    77  	"github.com/juju/juju/worker/modelworkermanager"
    78  	"github.com/juju/juju/worker/mongoupgrader"
    79  	"github.com/juju/juju/worker/peergrouper"
    80  	"github.com/juju/juju/worker/provisioner"
    81  	"github.com/juju/juju/worker/singular"
    82  	"github.com/juju/juju/worker/txnpruner"
    83  	"github.com/juju/juju/worker/upgradesteps"
    84  )
    85  
var (
	// logger is the loggo logger named "juju.cmd.jujud". jujuRun and
	// jujuDumpLogs are resolved once, at package initialisation, from the
	// paths package for the series reported by series.HostSeries().
	// NOTE(review): paths.MustSucceed presumably panics when the lookup
	// fails — confirm against the paths package.
	logger       = loggo.GetLogger("juju.cmd.jujud")
	jujuRun      = paths.MustSucceed(paths.JujuRun(series.HostSeries()))
	jujuDumpLogs = paths.MustSucceed(paths.JujuDumpLogs(series.HostSeries()))

	// The following are defined as variables to allow the tests to
	// intercept calls to the functions. reportOpenedState defaults to a
	// no-op that ignores the io.Closer it is given.
	useMultipleCPUs       = utils.UseMultipleCPUs
	modelManifolds        = model.Manifolds
	newSingularRunner     = singular.New
	peergrouperNew        = peergrouper.New
	newCertificateUpdater = certupdater.NewCertificateUpdater
	newMetadataUpdater    = imagemetadataworker.NewWorker
	newUpgradeMongoWorker = mongoupgrader.New
	reportOpenedState     = func(io.Closer) {}
)
   102  
// ProductionMongoWriteConcern selects the write concern applied by the
// PostDial hook installed in init: when true the hook sets J on the session's
// mgo.Safe and, if the replica set configuration can be read, also sets
// WMode to "majority". It is a variable so tests can override it; the
// default is true.
var ProductionMongoWriteConcern = true
   105  
   106  func init() {
   107  	stateWorkerDialOpts = mongo.DefaultDialOpts()
   108  	stateWorkerDialOpts.PostDial = func(session *mgo.Session) error {
   109  		safe := mgo.Safe{}
   110  		if ProductionMongoWriteConcern {
   111  			safe.J = true
   112  			_, err := replicaset.CurrentConfig(session)
   113  			if err == nil {
   114  				// set mongo to write-majority (writes only returned after
   115  				// replicated to a majority of replica-set members).
   116  				safe.WMode = "majority"
   117  			}
   118  		}
   119  		session.SetSafe(&safe)
   120  		return nil
   121  	}
   122  }
   123  
// AgentInitializer handles initializing a type for use as a Jujud
// agent.
type AgentInitializer interface {
	// AddFlags registers the initializer's command-line flags on the
	// given flag set; machineAgentCmd.SetFlags calls it before adding
	// its own flags.
	AddFlags(*gnuflag.FlagSet)
	// CheckArgs validates the positional arguments handed to
	// machineAgentCmd.Init; a non-nil error aborts Init.
	CheckArgs([]string) error
}
   130  
// AgentConfigWriter encapsulates disk I/O operations with the agent
// config. MachineAgent embeds it and wraps ChangeConfig with its own
// method of the same name.
type AgentConfigWriter interface {
	// ReadConfig reads the config for the given tag from disk.
	ReadConfig(tag string) error
	// ChangeConfig executes the given agent.ConfigMutator in a
	// thread-safe context.
	ChangeConfig(agent.ConfigMutator) error
	// CurrentConfig returns a copy of the in-memory agent config.
	CurrentConfig() agent.Config
}
   142  
   143  // NewMachineAgentCmd creates a Command which handles parsing
   144  // command-line arguments and instantiating and running a
   145  // MachineAgent.
   146  func NewMachineAgentCmd(
   147  	ctx *cmd.Context,
   148  	machineAgentFactory func(string) *MachineAgent,
   149  	agentInitializer AgentInitializer,
   150  	configFetcher AgentConfigWriter,
   151  ) cmd.Command {
   152  	return &machineAgentCmd{
   153  		ctx:                 ctx,
   154  		machineAgentFactory: machineAgentFactory,
   155  		agentInitializer:    agentInitializer,
   156  		currentConfig:       configFetcher,
   157  	}
   158  }
   159  
// machineAgentCmd is the cmd.Command implementation returned by
// NewMachineAgentCmd. It validates the --machine-id flag, redirects the
// command context's stderr to a rotating log file (unless logToStdErr is
// set) and delegates the real work to a MachineAgent.
type machineAgentCmd struct {
	cmd.CommandBase

	// This group of arguments is required. They are supplied by
	// NewMachineAgentCmd.
	agentInitializer    AgentInitializer
	currentConfig       AgentConfigWriter
	machineAgentFactory func(string) *MachineAgent
	ctx                 *cmd.Context

	// This group is for debugging purposes. When logToStdErr is true,
	// Init returns before reading the agent config and leaves ctx.Stderr
	// untouched.
	logToStdErr bool

	// The following are set via command-line flags (see SetFlags).
	machineId string
}
   175  
   176  // Init is called by the cmd system to initialize the structure for
   177  // running.
   178  func (a *machineAgentCmd) Init(args []string) error {
   179  
   180  	if !names.IsValidMachine(a.machineId) {
   181  		return fmt.Errorf("--machine-id option must be set, and expects a non-negative integer")
   182  	}
   183  	if err := a.agentInitializer.CheckArgs(args); err != nil {
   184  		return err
   185  	}
   186  
   187  	// Due to changes in the logging, and needing to care about old
   188  	// models that have been upgraded, we need to explicitly remove the
   189  	// file writer if one has been added, otherwise we will get duplicate
   190  	// lines of all logging in the log file.
   191  	loggo.RemoveWriter("logfile")
   192  
   193  	if a.logToStdErr {
   194  		return nil
   195  	}
   196  
   197  	err := a.currentConfig.ReadConfig(names.NewMachineTag(a.machineId).String())
   198  	if err != nil {
   199  		return errors.Annotate(err, "cannot read agent configuration")
   200  	}
   201  
   202  	// the context's stderr is set as the loggo writer in github.com/juju/cmd/logging.go
   203  	a.ctx.Stderr = &lumberjack.Logger{
   204  		Filename:   agent.LogFilename(a.currentConfig.CurrentConfig()),
   205  		MaxSize:    300, // megabytes
   206  		MaxBackups: 2,
   207  	}
   208  
   209  	return nil
   210  }
   211  
   212  // Run instantiates a MachineAgent and runs it.
   213  func (a *machineAgentCmd) Run(c *cmd.Context) error {
   214  	machineAgent := a.machineAgentFactory(a.machineId)
   215  	return machineAgent.Run(c)
   216  }
   217  
   218  // SetFlags adds the requisite flags to run this command.
   219  func (a *machineAgentCmd) SetFlags(f *gnuflag.FlagSet) {
   220  	a.agentInitializer.AddFlags(f)
   221  	f.StringVar(&a.machineId, "machine-id", "", "id of the machine to run")
   222  }
   223  
   224  // Info returns usage information for the command.
   225  func (a *machineAgentCmd) Info() *cmd.Info {
   226  	return &cmd.Info{
   227  		Name:    "machine",
   228  		Purpose: "run a juju machine agent",
   229  	}
   230  }
   231  
   232  // MachineAgentFactoryFn returns a function which instantiates a
   233  // MachineAgent given a machineId.
   234  func MachineAgentFactoryFn(
   235  	agentConfWriter AgentConfigWriter,
   236  	bufferedLogs logsender.LogRecordCh,
   237  	rootDir string,
   238  ) func(string) *MachineAgent {
   239  	return func(machineId string) *MachineAgent {
   240  		return NewMachineAgent(
   241  			machineId,
   242  			agentConfWriter,
   243  			bufferedLogs,
   244  			worker.NewRunner(cmdutil.IsFatal, cmdutil.MoreImportant, worker.RestartDelay),
   245  			looputil.NewLoopDeviceManager(),
   246  			rootDir,
   247  		)
   248  	}
   249  }
   250  
   251  // NewMachineAgent instantiates a new MachineAgent.
   252  func NewMachineAgent(
   253  	machineId string,
   254  	agentConfWriter AgentConfigWriter,
   255  	bufferedLogs logsender.LogRecordCh,
   256  	runner worker.Runner,
   257  	loopDeviceManager looputil.LoopDeviceManager,
   258  	rootDir string,
   259  ) *MachineAgent {
   260  	return &MachineAgent{
   261  		machineId:                   machineId,
   262  		AgentConfigWriter:           agentConfWriter,
   263  		configChangedVal:            voyeur.NewValue(true),
   264  		bufferedLogs:                bufferedLogs,
   265  		workersStarted:              make(chan struct{}),
   266  		runner:                      runner,
   267  		rootDir:                     rootDir,
   268  		initialUpgradeCheckComplete: gate.NewLock(),
   269  		loopDeviceManager:           loopDeviceManager,
   270  	}
   271  }
   272  
// MachineAgent is responsible for tying together all functionality
// needed to orchestrate a Jujud instance which controls a machine.
type MachineAgent struct {
	// AgentConfigWriter supplies ReadConfig and CurrentConfig directly;
	// its ChangeConfig is wrapped by MachineAgent.ChangeConfig.
	AgentConfigWriter

	// tomb is marked done when Run returns and is what Wait and Stop
	// block on. runner hosts the "engine" worker started by Run. rootDir,
	// bufferedLogs and configChangedVal are handed to the machine
	// manifolds; configChangedVal is also set by ChangeConfig.
	// upgradeComplete is assigned in Run from upgradesteps.NewLock.
	// workersStarted is closed by Run once the engine worker has been
	// handed to the runner.
	tomb             tomb.Tomb
	machineId        string
	runner           worker.Runner
	rootDir          string
	bufferedLogs     logsender.LogRecordCh
	configChangedVal *voyeur.Value
	upgradeComplete  gate.Lock
	workersStarted   chan struct{}

	// XXX(fwereade): these smell strongly of goroutine-unsafeness.
	// They are flipped by PrepareRestore, BeginRestore and EndRestore and
	// read by IsRestorePreparing and IsRestoreRunning without locking.
	restoreMode bool
	restoring   bool

	// Used to signal that the upgrade worker will not
	// reboot the agent on startup because there are no
	// longer any immediately pending agent upgrades.
	initialUpgradeCheckComplete gate.Lock

	// NOTE(review): discoverSpacesComplete is not assigned or read in the
	// part of the file reviewed here — confirm where it is initialised.
	discoverSpacesComplete gate.Lock

	// mongoInitialized is consulted by maybeStopMongo.
	// NOTE(review): presumably guarded by mongoInitMutex; the code that
	// takes the mutex is not visible here — confirm.
	mongoInitMutex   sync.Mutex
	mongoInitialized bool

	// loopDeviceManager is the manager supplied to NewMachineAgent.
	loopDeviceManager looputil.LoopDeviceManager
}
   303  
   304  // IsRestorePreparing returns bool representing if we are in restore mode
   305  // but not running restore.
   306  func (a *MachineAgent) IsRestorePreparing() bool {
   307  	return a.restoreMode && !a.restoring
   308  }
   309  
   310  // IsRestoreRunning returns bool representing if we are in restore mode
   311  // and running the actual restore process.
   312  func (a *MachineAgent) IsRestoreRunning() bool {
   313  	return a.restoring
   314  }
   315  
   316  func (a *MachineAgent) isUpgradeRunning() bool {
   317  	return !a.upgradeComplete.IsUnlocked()
   318  }
   319  
   320  func (a *MachineAgent) isInitialUpgradeCheckPending() bool {
   321  	return !a.initialUpgradeCheckComplete.IsUnlocked()
   322  }
   323  
   324  // Wait waits for the machine agent to finish.
   325  func (a *MachineAgent) Wait() error {
   326  	return a.tomb.Wait()
   327  }
   328  
   329  // Stop stops the machine agent.
   330  func (a *MachineAgent) Stop() error {
   331  	a.runner.Kill()
   332  	return a.tomb.Wait()
   333  }
   334  
   335  // upgradeCertificateDNSNames ensure that the controller certificate
   336  // recorded in the agent config and also mongo server.pem contains the
   337  // DNSNames entires required by Juju/
   338  func (a *MachineAgent) upgradeCertificateDNSNames() error {
   339  	agentConfig := a.CurrentConfig()
   340  	si, ok := agentConfig.StateServingInfo()
   341  	if !ok || si.CAPrivateKey == "" {
   342  		// No certificate information exists yet, nothing to do.
   343  		return nil
   344  	}
   345  	// Parse the current certificate to get the current dns names.
   346  	serverCert, err := cert.ParseCert(si.Cert)
   347  	if err != nil {
   348  		return err
   349  	}
   350  	update := false
   351  	dnsNames := set.NewStrings(serverCert.DNSNames...)
   352  	requiredDNSNames := []string{"local", "juju-apiserver", "juju-mongodb"}
   353  	for _, dnsName := range requiredDNSNames {
   354  		if dnsNames.Contains(dnsName) {
   355  			continue
   356  		}
   357  		dnsNames.Add(dnsName)
   358  		update = true
   359  	}
   360  	if !update {
   361  		return nil
   362  	}
   363  	// Write a new certificate to the mongo pem and agent config files.
   364  	si.Cert, si.PrivateKey, err = cert.NewDefaultServer(agentConfig.CACert(), si.CAPrivateKey, dnsNames.Values())
   365  	if err != nil {
   366  		return err
   367  	}
   368  	if err := mongo.UpdateSSLKey(agentConfig.DataDir(), si.Cert, si.PrivateKey); err != nil {
   369  		return err
   370  	}
   371  	return a.AgentConfigWriter.ChangeConfig(func(config agent.ConfigSetter) error {
   372  		config.SetStateServingInfo(si)
   373  		return nil
   374  	})
   375  }
   376  
   377  // Run runs a machine agent.
   378  func (a *MachineAgent) Run(*cmd.Context) error {
   379  
   380  	defer a.tomb.Done()
   381  	if err := a.ReadConfig(a.Tag().String()); err != nil {
   382  		return fmt.Errorf("cannot read agent configuration: %v", err)
   383  	}
   384  
   385  	logger.Infof("machine agent %v start (%s [%s])", a.Tag(), jujuversion.Current, runtime.Compiler)
   386  	if flags := featureflag.String(); flags != "" {
   387  		logger.Warningf("developer feature flags enabled: %s", flags)
   388  	}
   389  
   390  	// Before doing anything else, we need to make sure the certificate generated for
   391  	// use by mongo to validate controller connections is correct. This needs to be done
   392  	// before any possible restart of the mongo service.
   393  	// See bug http://pad.lv/1434680
   394  	if err := a.upgradeCertificateDNSNames(); err != nil {
   395  		return errors.Annotate(err, "error upgrading server certificate")
   396  	}
   397  
   398  	if upgradeComplete, err := upgradesteps.NewLock(a); err != nil {
   399  		return errors.Annotate(err, "error during creating upgrade completion channel")
   400  	} else {
   401  		a.upgradeComplete = upgradeComplete
   402  	}
   403  
   404  	agentConfig := a.CurrentConfig()
   405  	createEngine := a.makeEngineCreator(agentConfig.UpgradedToVersion())
   406  	network.SetPreferIPv6(agentConfig.PreferIPv6())
   407  	charmrepo.CacheDir = filepath.Join(agentConfig.DataDir(), "charmcache")
   408  	if err := a.createJujudSymlinks(agentConfig.DataDir()); err != nil {
   409  		return err
   410  	}
   411  	a.runner.StartWorker("engine", createEngine)
   412  
   413  	// At this point, all workers will have been configured to start
   414  	close(a.workersStarted)
   415  	err := a.runner.Wait()
   416  	switch errors.Cause(err) {
   417  	case worker.ErrTerminateAgent:
   418  		err = a.uninstallAgent()
   419  	case worker.ErrRebootMachine:
   420  		logger.Infof("Caught reboot error")
   421  		err = a.executeRebootOrShutdown(params.ShouldReboot)
   422  	case worker.ErrShutdownMachine:
   423  		logger.Infof("Caught shutdown error")
   424  		err = a.executeRebootOrShutdown(params.ShouldShutdown)
   425  	}
   426  	err = cmdutil.AgentDone(logger, err)
   427  	a.tomb.Kill(err)
   428  	return err
   429  }
   430  
   431  func (a *MachineAgent) makeEngineCreator(previousAgentVersion version.Number) func() (worker.Worker, error) {
   432  	return func() (worker.Worker, error) {
   433  		config := dependency.EngineConfig{
   434  			IsFatal:     cmdutil.IsFatal,
   435  			WorstError:  cmdutil.MoreImportantError,
   436  			ErrorDelay:  3 * time.Second,
   437  			BounceDelay: 10 * time.Millisecond,
   438  		}
   439  		engine, err := dependency.NewEngine(config)
   440  		if err != nil {
   441  			return nil, err
   442  		}
   443  		manifolds := machine.Manifolds(machine.ManifoldsConfig{
   444  			PreviousAgentVersion: previousAgentVersion,
   445  			Agent:                agent.APIHostPortsSetter{Agent: a},
   446  			RootDir:              a.rootDir,
   447  			AgentConfigChanged:   a.configChangedVal,
   448  			UpgradeStepsLock:     a.upgradeComplete,
   449  			UpgradeCheckLock:     a.initialUpgradeCheckComplete,
   450  			OpenState:            a.initState,
   451  			OpenStateForUpgrade:  a.openStateForUpgrade,
   452  			StartStateWorkers:    a.startStateWorkers,
   453  			StartAPIWorkers:      a.startAPIWorkers,
   454  			PreUpgradeSteps:      upgrades.PreUpgradeSteps,
   455  			LogSource:            a.bufferedLogs,
   456  			NewDeployContext:     newDeployContext,
   457  			Clock:                clock.WallClock,
   458  		})
   459  		if err := dependency.Install(engine, manifolds); err != nil {
   460  			if err := worker.Stop(engine); err != nil {
   461  				logger.Errorf("while stopping engine with bad manifolds: %v", err)
   462  			}
   463  			return nil, err
   464  		}
   465  		return engine, nil
   466  	}
   467  }
   468  
   469  func (a *MachineAgent) executeRebootOrShutdown(action params.RebootAction) error {
   470  	// At this stage, all API connections would have been closed
   471  	// We need to reopen the API to clear the reboot flag after
   472  	// scheduling the reboot. It may be cleaner to do this in the reboot
   473  	// worker, before returning the ErrRebootMachine.
   474  	conn, err := apicaller.OnlyConnect(a, apicaller.APIOpen)
   475  	if err != nil {
   476  		logger.Infof("Reboot: Error connecting to state")
   477  		return errors.Trace(err)
   478  	}
   479  
   480  	// block until all units/containers are ready, and reboot/shutdown
   481  	finalize, err := reboot.NewRebootWaiter(conn, a.CurrentConfig())
   482  	if err != nil {
   483  		return errors.Trace(err)
   484  	}
   485  
   486  	logger.Infof("Reboot: Executing reboot")
   487  	err = finalize.ExecuteReboot(action)
   488  	if err != nil {
   489  		logger.Infof("Reboot: Error executing reboot: %v", err)
   490  		return errors.Trace(err)
   491  	}
   492  	// On windows, the shutdown command is asynchronous. We return ErrRebootMachine
   493  	// so the agent will simply exit without error pending reboot/shutdown.
   494  	return worker.ErrRebootMachine
   495  }
   496  
   497  func (a *MachineAgent) ChangeConfig(mutate agent.ConfigMutator) error {
   498  	err := a.AgentConfigWriter.ChangeConfig(mutate)
   499  	a.configChangedVal.Set(true)
   500  	return errors.Trace(err)
   501  }
   502  
   503  func (a *MachineAgent) maybeStopMongo(ver mongo.Version, isMaster bool) error {
   504  	if !a.mongoInitialized {
   505  		return nil
   506  	}
   507  
   508  	conf := a.AgentConfigWriter.CurrentConfig()
   509  	v := conf.MongoVersion()
   510  
   511  	logger.Errorf("Got version change %v", ver)
   512  	// TODO(perrito666) replace with "read-only" mode for environment when
   513  	// it is available.
   514  	if ver.NewerThan(v) > 0 {
   515  		err := a.AgentConfigWriter.ChangeConfig(func(config agent.ConfigSetter) error {
   516  			config.SetMongoVersion(mongo.MongoUpgrade)
   517  			return nil
   518  		})
   519  		if err != nil {
   520  			return err
   521  		}
   522  
   523  	}
   524  	return nil
   525  
   526  }
   527  
   528  // PrepareRestore will flag the agent to allow only a limited set
   529  // of commands defined in
   530  // "github.com/juju/juju/apiserver".allowedMethodsAboutToRestore
   531  // the most noteworthy is:
   532  // Backups.Restore: this will ensure that we can do all the file movements
   533  // required for restore and no one will do changes while we do that.
   534  // it will return error if the machine is already in this state.
   535  func (a *MachineAgent) PrepareRestore() error {
   536  	if a.restoreMode {
   537  		return errors.Errorf("already in restore mode")
   538  	}
   539  	a.restoreMode = true
   540  	return nil
   541  }
   542  
   543  // BeginRestore will flag the agent to disallow all commands since
   544  // restore should be running and therefore making changes that
   545  // would override anything done.
   546  func (a *MachineAgent) BeginRestore() error {
   547  	switch {
   548  	case !a.restoreMode:
   549  		return errors.Errorf("not in restore mode, cannot begin restoration")
   550  	case a.restoring:
   551  		return errors.Errorf("already restoring")
   552  	}
   553  	a.restoring = true
   554  	return nil
   555  }
   556  
   557  // EndRestore will flag the agent to allow all commands
   558  // This being invoked means that restore process failed
   559  // since success restarts the agent.
   560  func (a *MachineAgent) EndRestore() {
   561  	a.restoreMode = false
   562  	a.restoring = false
   563  }
   564  
   565  // newRestoreStateWatcherWorker will return a worker or err if there
   566  // is a failure, the worker takes care of watching the state of
   567  // restoreInfo doc and put the agent in the different restore modes.
   568  func (a *MachineAgent) newRestoreStateWatcherWorker(st *state.State) (worker.Worker, error) {
   569  	rWorker := func(stopch <-chan struct{}) error {
   570  		return a.restoreStateWatcher(st, stopch)
   571  	}
   572  	return worker.NewSimpleWorker(rWorker), nil
   573  }
   574  
   575  // restoreChanged will be called whenever restoreInfo doc changes signaling a new
   576  // step in the restore process.
   577  func (a *MachineAgent) restoreChanged(st *state.State) error {
   578  	rinfo, err := st.RestoreInfoSetter()
   579  	if err != nil {
   580  		return errors.Annotate(err, "cannot read restore state")
   581  	}
   582  	switch rinfo.Status() {
   583  	case state.RestorePending:
   584  		a.PrepareRestore()
   585  	case state.RestoreInProgress:
   586  		a.BeginRestore()
   587  	case state.RestoreFailed:
   588  		a.EndRestore()
   589  	}
   590  	return nil
   591  }
   592  
   593  // restoreStateWatcher watches for restoreInfo looking for changes in the restore process.
   594  func (a *MachineAgent) restoreStateWatcher(st *state.State, stopch <-chan struct{}) error {
   595  	restoreWatch := st.WatchRestoreInfoChanges()
   596  	defer func() {
   597  		restoreWatch.Kill()
   598  		restoreWatch.Wait()
   599  	}()
   600  
   601  	for {
   602  		select {
   603  		case <-restoreWatch.Changes():
   604  			if err := a.restoreChanged(st); err != nil {
   605  				return err
   606  			}
   607  		case <-stopch:
   608  			return nil
   609  		}
   610  	}
   611  }
   612  
// newEnvirons builds the environ that startAPIWorkers uses for a model
// manager machine; it defaults to environs.New.
// NOTE(review): presumably a variable so tests can substitute it — confirm.
var newEnvirons = environs.New
   614  
   615  // startAPIWorkers is called to start workers which rely on the
   616  // machine agent's API connection (via the apiworkers manifold). It
   617  // returns a Runner with a number of workers attached to it.
   618  //
   619  // The workers started here need to be converted to run under the
   620  // dependency engine. Once they have all been converted, this method -
   621  // and the apiworkers manifold - can be removed.
   622  func (a *MachineAgent) startAPIWorkers(apiConn api.Connection) (_ worker.Worker, outErr error) {
   623  	agentConfig := a.CurrentConfig()
   624  
   625  	entity, err := apiagent.NewState(apiConn).Entity(a.Tag())
   626  	if err != nil {
   627  		return nil, errors.Trace(err)
   628  	}
   629  
   630  	var isModelManager bool
   631  	for _, job := range entity.Jobs() {
   632  		switch job {
   633  		case multiwatcher.JobManageModel:
   634  			isModelManager = true
   635  		default:
   636  			// TODO(dimitern): Once all workers moved over to using
   637  			// the API, report "unknown job type" here.
   638  		}
   639  	}
   640  
   641  	runner := newConnRunner(apiConn)
   642  	defer func() {
   643  		// If startAPIWorkers exits early with an error, stop the
   644  		// runner so that any already started runners aren't leaked.
   645  		if outErr != nil {
   646  			worker.Stop(runner)
   647  		}
   648  	}()
   649  
   650  	modelConfig, err := apiagent.NewState(apiConn).ModelConfig()
   651  	if err != nil {
   652  		return nil, fmt.Errorf("cannot read model config: %v", err)
   653  	}
   654  
   655  	// Perform the operations needed to set up hosting for containers.
   656  	if err := a.setupContainerSupport(runner, apiConn, agentConfig); err != nil {
   657  		cause := errors.Cause(err)
   658  		if params.IsCodeDead(cause) || cause == worker.ErrTerminateAgent {
   659  			return nil, worker.ErrTerminateAgent
   660  		}
   661  		return nil, fmt.Errorf("setting up container support: %v", err)
   662  	}
   663  
   664  	if isModelManager {
   665  
   666  		// Published image metadata for some providers are in simple streams.
   667  		// Providers that do not depend on simple streams do not need this worker.
   668  		env, err := newEnvirons(modelConfig)
   669  		if err != nil {
   670  			return nil, errors.Annotate(err, "getting environ")
   671  		}
   672  		if _, ok := env.(simplestreams.HasRegion); ok {
   673  			// Start worker that stores published image metadata in state.
   674  			runner.StartWorker("imagemetadata", func() (worker.Worker, error) {
   675  				return newMetadataUpdater(apiConn.MetadataUpdater()), nil
   676  			})
   677  		}
   678  
   679  		// We don't have instance info set and the network config for the
   680  		// bootstrap machine only, so update it now. All the other machines will
   681  		// have instance info including network config set at provisioning time.
   682  		if err := a.setControllerNetworkConfig(apiConn); err != nil {
   683  			return nil, errors.Annotate(err, "setting controller network config")
   684  		}
   685  	} else {
   686  		runner.StartWorker("stateconverter", func() (worker.Worker, error) {
   687  			// TODO(fwereade): this worker needs its own facade.
   688  			facade := apimachiner.NewState(apiConn)
   689  			handler := conv2state.New(facade, a)
   690  			w, err := watcher.NewNotifyWorker(watcher.NotifyConfig{
   691  				Handler: handler,
   692  			})
   693  			if err != nil {
   694  				return nil, errors.Annotate(err, "cannot start controller promoter worker")
   695  			}
   696  			return w, nil
   697  		})
   698  	}
   699  	return runner, nil
   700  }
   701  
   702  func (a *MachineAgent) setControllerNetworkConfig(apiConn api.Connection) error {
   703  	machinerAPI := apimachiner.NewState(apiConn)
   704  	agentConfig := a.CurrentConfig()
   705  
   706  	tag := agentConfig.Tag().(names.MachineTag)
   707  	machine, err := machinerAPI.Machine(tag)
   708  	if errors.IsNotFound(err) || err == nil && machine.Life() == params.Dead {
   709  		return worker.ErrTerminateAgent
   710  	}
   711  	if err != nil {
   712  		return errors.Annotatef(err, "cannot load machine %s from state", tag)
   713  	}
   714  
   715  	if err := machine.SetProviderNetworkConfig(); err != nil {
   716  		return errors.Annotate(err, "cannot set controller provider network config")
   717  	}
   718  	return nil
   719  }
   720  
   721  // Restart restarts the agent's service.
   722  func (a *MachineAgent) Restart() error {
   723  	name := a.CurrentConfig().Value(agent.AgentServiceName)
   724  	return service.Restart(name)
   725  }
   726  
   727  // openStateForUpgrade exists to be passed into the upgradesteps
   728  // worker. The upgradesteps worker opens state independently of the
   729  // state worker so that it isn't affected by the state worker's
   730  // lifetime. It ensures the MongoDB server is configured and started,
   731  // and then opens a state connection.
   732  //
   733  // TODO(mjs)- review the need for this once the dependency engine is
   734  // in use. Why can't upgradesteps depend on the main state connection?
   735  func (a *MachineAgent) openStateForUpgrade() (*state.State, error) {
   736  	agentConfig := a.CurrentConfig()
   737  	if err := a.ensureMongoServer(agentConfig); err != nil {
   738  		return nil, errors.Trace(err)
   739  	}
   740  	info, ok := agentConfig.MongoInfo()
   741  	if !ok {
   742  		return nil, errors.New("no state info available")
   743  	}
   744  	st, err := state.Open(agentConfig.Model(), info, mongo.DefaultDialOpts(), environs.NewStatePolicy())
   745  	if err != nil {
   746  		return nil, errors.Trace(err)
   747  	}
   748  	return st, nil
   749  }
   750  
   751  // setupContainerSupport determines what containers can be run on this machine and
   752  // initialises suitable infrastructure to support such containers.
   753  func (a *MachineAgent) setupContainerSupport(runner worker.Runner, st api.Connection, agentConfig agent.Config) error {
   754  	var supportedContainers []instance.ContainerType
   755  	supportsContainers := container.ContainersSupported()
   756  	if supportsContainers {
   757  		supportedContainers = append(supportedContainers, instance.LXC, instance.LXD)
   758  	}
   759  
   760  	supportsKvm, err := kvm.IsKVMSupported()
   761  	if err != nil {
   762  		logger.Warningf("determining kvm support: %v\nno kvm containers possible", err)
   763  	}
   764  	if err == nil && supportsKvm {
   765  		supportedContainers = append(supportedContainers, instance.KVM)
   766  	}
   767  
   768  	return a.updateSupportedContainers(runner, st, supportedContainers, agentConfig)
   769  }
   770  
   771  // updateSupportedContainers records in state that a machine can run the specified containers.
   772  // It starts a watcher and when a container of a given type is first added to the machine,
   773  // the watcher is killed, the machine is set up to be able to start containers of the given type,
   774  // and a suitable provisioner is started.
   775  func (a *MachineAgent) updateSupportedContainers(
   776  	runner worker.Runner,
   777  	st api.Connection,
   778  	containers []instance.ContainerType,
   779  	agentConfig agent.Config,
   780  ) error {
   781  	pr := st.Provisioner()
   782  	tag := agentConfig.Tag().(names.MachineTag)
   783  	machine, err := pr.Machine(tag)
   784  	if errors.IsNotFound(err) || err == nil && machine.Life() == params.Dead {
   785  		return worker.ErrTerminateAgent
   786  	}
   787  	if err != nil {
   788  		return errors.Annotatef(err, "cannot load machine %s from state", tag)
   789  	}
   790  	if len(containers) == 0 {
   791  		if err := machine.SupportsNoContainers(); err != nil {
   792  			return errors.Annotatef(err, "clearing supported containers for %s", tag)
   793  		}
   794  		return nil
   795  	}
   796  	if err := machine.SetSupportedContainers(containers...); err != nil {
   797  		return errors.Annotatef(err, "setting supported containers for %s", tag)
   798  	}
   799  	initLock, err := cmdutil.HookExecutionLock(agentConfig.DataDir())
   800  	if err != nil {
   801  		return err
   802  	}
   803  	// Start the watcher to fire when a container is first requested on the machine.
   804  	modelUUID, err := st.ModelTag()
   805  	if err != nil {
   806  		return err
   807  	}
   808  	watcherName := fmt.Sprintf("%s-container-watcher", machine.Id())
   809  	// There may not be a CA certificate private key available, and without
   810  	// it we can't ensure that other Juju nodes can connect securely, so only
   811  	// use an image URL getter if there's a private key.
   812  	var imageURLGetter container.ImageURLGetter
   813  	if agentConfig.Value(agent.AllowsSecureConnection) == "true" {
   814  		cfg, err := pr.ModelConfig()
   815  		if err != nil {
   816  			return errors.Annotate(err, "unable to get environ config")
   817  		}
   818  		imageURLGetter = container.NewImageURLGetter(
   819  			// Explicitly call the non-named constructor so if anyone
   820  			// adds additional fields, this fails.
   821  			container.ImageURLGetterConfig{
   822  				ServerRoot:        st.Addr(),
   823  				ModelUUID:         modelUUID.Id(),
   824  				CACert:            []byte(agentConfig.CACert()),
   825  				CloudimgBaseUrl:   cfg.CloudImageBaseURL(),
   826  				Stream:            cfg.ImageStream(),
   827  				ImageDownloadFunc: container.ImageDownloadURL,
   828  			})
   829  	}
   830  	params := provisioner.ContainerSetupParams{
   831  		Runner:              runner,
   832  		WorkerName:          watcherName,
   833  		SupportedContainers: containers,
   834  		ImageURLGetter:      imageURLGetter,
   835  		Machine:             machine,
   836  		Provisioner:         pr,
   837  		Config:              agentConfig,
   838  		InitLock:            initLock,
   839  	}
   840  	handler := provisioner.NewContainerSetupHandler(params)
   841  	a.startWorkerAfterUpgrade(runner, watcherName, func() (worker.Worker, error) {
   842  		w, err := watcher.NewStringsWorker(watcher.StringsConfig{
   843  			Handler: handler,
   844  		})
   845  		if err != nil {
   846  			return nil, errors.Annotatef(err, "cannot start %s worker", watcherName)
   847  		}
   848  		return w, nil
   849  	})
   850  	return nil
   851  }
   852  
   853  func (a *MachineAgent) initState(agentConfig agent.Config) (*state.State, error) {
   854  	// Start MongoDB server and dial.
   855  	if err := a.ensureMongoServer(agentConfig); err != nil {
   856  		return nil, err
   857  	}
   858  
   859  	st, _, err := openState(agentConfig, stateWorkerDialOpts)
   860  	if err != nil {
   861  		return nil, err
   862  	}
   863  
   864  	reportOpenedState(st)
   865  
   866  	return st, nil
   867  }
   868  
// startStateWorkers returns a worker running all the workers that
// require a *state.State connection.
//
// The machine's jobs decide which workers are registered: only
// state.JobManageModel registers any here. An unknown job type aborts with
// an error. The returned worker is the runner the workers were registered on.
func (a *MachineAgent) startStateWorkers(st *state.State) (worker.Worker, error) {
	agentConfig := a.CurrentConfig()

	m, err := getMachine(st, agentConfig.Tag())
	if err != nil {
		return nil, errors.Annotate(err, "machine lookup")
	}

	// runner hosts most workers; singularRunner wraps it for the pruners below.
	// NOTE(review): presumably singular workers only run on the mongo master
	// (see singularStateConn.IsMaster) — confirm against newSingularRunner.
	runner := newConnRunner(st)
	singularRunner, err := newSingularStateRunner(runner, st, m)
	if err != nil {
		return nil, errors.Trace(err)
	}

	for _, job := range m.Jobs() {
		switch job {
		case state.JobHostUnits:
			// Implemented elsewhere with workers that use the API.
		case state.JobManageNetworking:
			// Not used by state workers.
		case state.JobManageModel:
			useMultipleCPUs()
			// Each of the following workers is gated on upgrade completion.
			a.startWorkerAfterUpgrade(runner, "model worker manager", func() (worker.Worker, error) {
				w, err := modelworkermanager.New(modelworkermanager.Config{
					Backend:    st,
					NewWorker:  a.startModelWorkers,
					ErrorDelay: worker.RestartDelay,
				})
				if err != nil {
					return nil, errors.Annotate(err, "cannot start model worker manager")
				}
				return w, nil
			})
			a.startWorkerAfterUpgrade(runner, "peergrouper", func() (worker.Worker, error) {
				w, err := peergrouperNew(st)
				if err != nil {
					return nil, errors.Annotate(err, "cannot start peergrouper worker")
				}
				return w, nil
			})
			a.startWorkerAfterUpgrade(runner, "restore", func() (worker.Worker, error) {
				w, err := a.newRestoreStateWatcherWorker(st)
				if err != nil {
					return nil, errors.Annotate(err, "cannot start backup-restorer worker")
				}
				return w, nil
			})
			a.startWorkerAfterUpgrade(runner, "mongoupgrade", func() (worker.Worker, error) {
				return newUpgradeMongoWorker(st, a.machineId, a.maybeStopMongo)
			})

			// certChangedChan is shared by multiple workers it's up
			// to the agent to close it rather than any one of the
			// workers.  It is possible that multiple cert changes
			// come in before the apiserver is up to receive them.
			// Specify a bigger buffer to prevent deadlock when
			// the apiserver isn't up yet.  Use a size of 10 since we
			// allow up to 7 controllers, and might also update the
			// addresses of the local machine (127.0.0.1, ::1, etc).
			//
			// TODO(cherylj/waigani) Remove this workaround when
			// certupdater and apiserver can properly manage dependencies
			// through the dependency engine.
			//
			// TODO(ericsnow) For now we simply do not close the channel.
			certChangedChan := make(chan params.StateServingInfo, 10)
			// Each time apiserver worker is restarted, we need a fresh copy of state due
			// to the fact that state holds lease managers which are killed and need to be reset.
			stateOpener := func() (*state.State, error) {
				logger.Debugf("opening state for apiserver worker")
				st, _, err := openState(agentConfig, stateWorkerDialOpts)
				return st, err
			}
			// Unlike the other workers, the apiserver is started directly on
			// the runner rather than through startWorkerAfterUpgrade.
			runner.StartWorker("apiserver", a.apiserverWorkerStarter(stateOpener, certChangedChan))
			// stateServingSetter stores the new serving info in the agent
			// config and then hands it to the apiserver over certChangedChan,
			// giving up on the send if done fires first.
			var stateServingSetter certupdater.StateServingInfoSetter = func(info params.StateServingInfo, done <-chan struct{}) error {
				return a.ChangeConfig(func(config agent.ConfigSetter) error {
					config.SetStateServingInfo(info)
					logger.Infof("update apiserver worker with new certificate")
					select {
					case certChangedChan <- info:
						return nil
					case <-done:
						return nil
					}
				})
			}
			a.startWorkerAfterUpgrade(runner, "certupdater", func() (worker.Worker, error) {
				return newCertificateUpdater(m, agentConfig, st, st, stateServingSetter), nil
			})

			// The pruners are registered on the singular runner.
			a.startWorkerAfterUpgrade(singularRunner, "dblogpruner", func() (worker.Worker, error) {
				return dblogpruner.New(st, dblogpruner.NewLogPruneParams()), nil
			})

			a.startWorkerAfterUpgrade(singularRunner, "txnpruner", func() (worker.Worker, error) {
				return txnpruner.New(st, time.Hour*2), nil
			})
		default:
			return nil, errors.Errorf("unknown job type %q", job)
		}
	}
	return runner, nil
}
   974  
   975  // startModelWorkers starts the set of workers that run for every model
   976  // in each controller.
   977  func (a *MachineAgent) startModelWorkers(uuid string) (worker.Worker, error) {
   978  	modelAgent, err := model.WrapAgent(a, uuid)
   979  	if err != nil {
   980  		return nil, errors.Trace(err)
   981  	}
   982  
   983  	engine, err := dependency.NewEngine(dependency.EngineConfig{
   984  		IsFatal:     model.IsFatal,
   985  		WorstError:  model.WorstError,
   986  		Filter:      model.IgnoreErrRemoved,
   987  		ErrorDelay:  3 * time.Second,
   988  		BounceDelay: 10 * time.Millisecond,
   989  	})
   990  	if err != nil {
   991  		return nil, errors.Trace(err)
   992  	}
   993  
   994  	manifolds := modelManifolds(model.ManifoldsConfig{
   995  		Agent:                       modelAgent,
   996  		AgentConfigChanged:          a.configChangedVal,
   997  		Clock:                       clock.WallClock,
   998  		RunFlagDuration:             time.Minute,
   999  		CharmRevisionUpdateInterval: 24 * time.Hour,
  1000  		EntityStatusHistoryCount:    100,
  1001  		EntityStatusHistoryInterval: 5 * time.Minute,
  1002  		SpacesImportedGate:          a.discoverSpacesComplete,
  1003  	})
  1004  	if err := dependency.Install(engine, manifolds); err != nil {
  1005  		if err := worker.Stop(engine); err != nil {
  1006  			logger.Errorf("while stopping engine with bad manifolds: %v", err)
  1007  		}
  1008  		return nil, errors.Trace(err)
  1009  	}
  1010  	return engine, nil
  1011  }
  1012  
// stateWorkerDialOpts is a mongo.DialOpts suitable
// for use by StateWorker to dial mongo. In this file it is
// the value passed to openState when opening state connections.
//
// This must be overridden in tests, as it assumes
// journaling is enabled.
//
// NOTE(review): it is declared without an initializer here, so it is
// presumably assigned elsewhere (e.g. in an init function) — confirm.
var stateWorkerDialOpts mongo.DialOpts
  1019  
  1020  func (a *MachineAgent) apiserverWorkerStarter(
  1021  	stateOpener func() (*state.State, error), certChanged chan params.StateServingInfo,
  1022  ) func() (worker.Worker, error) {
  1023  	return func() (worker.Worker, error) {
  1024  		st, err := stateOpener()
  1025  		if err != nil {
  1026  			return nil, errors.Trace(err)
  1027  		}
  1028  		return a.newApiserverWorker(st, certChanged)
  1029  	}
  1030  }
  1031  
  1032  func (a *MachineAgent) newApiserverWorker(st *state.State, certChanged chan params.StateServingInfo) (worker.Worker, error) {
  1033  	agentConfig := a.CurrentConfig()
  1034  	// If the configuration does not have the required information,
  1035  	// it is currently not a recoverable error, so we kill the whole
  1036  	// agent, potentially enabling human intervention to fix
  1037  	// the agent's configuration file.
  1038  	info, ok := agentConfig.StateServingInfo()
  1039  	if !ok {
  1040  		return nil, &cmdutil.FatalError{"StateServingInfo not available and we need it"}
  1041  	}
  1042  	cert := []byte(info.Cert)
  1043  	key := []byte(info.PrivateKey)
  1044  
  1045  	if len(cert) == 0 || len(key) == 0 {
  1046  		return nil, &cmdutil.FatalError{"configuration does not have controller cert/key"}
  1047  	}
  1048  	tag := agentConfig.Tag()
  1049  	dataDir := agentConfig.DataDir()
  1050  	logDir := agentConfig.LogDir()
  1051  
  1052  	endpoint := net.JoinHostPort("", strconv.Itoa(info.APIPort))
  1053  	listener, err := net.Listen("tcp", endpoint)
  1054  	if err != nil {
  1055  		return nil, err
  1056  	}
  1057  	w, err := apiserver.NewServer(st, listener, apiserver.ServerConfig{
  1058  		Cert:        cert,
  1059  		Key:         key,
  1060  		Tag:         tag,
  1061  		DataDir:     dataDir,
  1062  		LogDir:      logDir,
  1063  		Validator:   a.limitLogins,
  1064  		CertChanged: certChanged,
  1065  	})
  1066  	if err != nil {
  1067  		return nil, errors.Annotate(err, "cannot start api server worker")
  1068  	}
  1069  	return w, nil
  1070  }
  1071  
  1072  // limitLogins is called by the API server for each login attempt.
  1073  // it returns an error if upgrades or restore are running.
  1074  func (a *MachineAgent) limitLogins(req params.LoginRequest) error {
  1075  	if err := a.limitLoginsDuringRestore(req); err != nil {
  1076  		return err
  1077  	}
  1078  	if err := a.limitLoginsDuringUpgrade(req); err != nil {
  1079  		return err
  1080  	}
  1081  	return a.limitLoginsDuringMongoUpgrade(req)
  1082  }
  1083  
  1084  func (a *MachineAgent) limitLoginsDuringMongoUpgrade(req params.LoginRequest) error {
  1085  	// If upgrade is running we will not be able to lock AgentConfigWriter
  1086  	// and it also means we are not upgrading mongo.
  1087  	if a.isUpgradeRunning() {
  1088  		return nil
  1089  	}
  1090  	cfg := a.AgentConfigWriter.CurrentConfig()
  1091  	ver := cfg.MongoVersion()
  1092  	if ver == mongo.MongoUpgrade {
  1093  		return errors.New("Upgrading Mongo")
  1094  	}
  1095  	return nil
  1096  }
  1097  
  1098  // limitLoginsDuringRestore will only allow logins for restore related purposes
  1099  // while the different steps of restore are running.
  1100  func (a *MachineAgent) limitLoginsDuringRestore(req params.LoginRequest) error {
  1101  	var err error
  1102  	switch {
  1103  	case a.IsRestoreRunning():
  1104  		err = apiserver.RestoreInProgressError
  1105  	case a.IsRestorePreparing():
  1106  		err = apiserver.AboutToRestoreError
  1107  	}
  1108  	if err != nil {
  1109  		authTag, parseErr := names.ParseTag(req.AuthTag)
  1110  		if parseErr != nil {
  1111  			return errors.Annotate(err, "could not parse auth tag")
  1112  		}
  1113  		switch authTag := authTag.(type) {
  1114  		case names.UserTag:
  1115  			// use a restricted API mode
  1116  			return err
  1117  		case names.MachineTag:
  1118  			if authTag == a.Tag() {
  1119  				// allow logins from the local machine
  1120  				return nil
  1121  			}
  1122  		}
  1123  		return errors.Errorf("login for %q blocked because restore is in progress", authTag)
  1124  	}
  1125  	return nil
  1126  }
  1127  
  1128  // limitLoginsDuringUpgrade is called by the API server for each login
  1129  // attempt. It returns an error if upgrades are in progress unless the
  1130  // login is for a user (i.e. a client) or the local machine.
  1131  func (a *MachineAgent) limitLoginsDuringUpgrade(req params.LoginRequest) error {
  1132  	if a.isUpgradeRunning() || a.isInitialUpgradeCheckPending() {
  1133  		authTag, err := names.ParseTag(req.AuthTag)
  1134  		if err != nil {
  1135  			return errors.Annotate(err, "could not parse auth tag")
  1136  		}
  1137  		switch authTag := authTag.(type) {
  1138  		case names.UserTag:
  1139  			// use a restricted API mode
  1140  			return params.UpgradeInProgressError
  1141  		case names.MachineTag:
  1142  			if authTag == a.Tag() {
  1143  				// allow logins from the local machine
  1144  				return nil
  1145  			}
  1146  		}
  1147  		return errors.Errorf("login for %q blocked because %s", authTag, params.CodeUpgradeInProgress)
  1148  	} else {
  1149  		return nil // allow all logins
  1150  	}
  1151  }
  1152  
// stateWorkerServingConfigErr reports that a state worker was started
// although no state serving info was available.
// NOTE(review): no use of this variable is visible in this part of the file.
var stateWorkerServingConfigErr = errors.New("state worker started with no state serving info")
  1154  
  1155  // ensureMongoServer ensures that mongo is installed and running,
  1156  // and ready for opening a state connection.
  1157  func (a *MachineAgent) ensureMongoServer(agentConfig agent.Config) (err error) {
  1158  	a.mongoInitMutex.Lock()
  1159  	defer a.mongoInitMutex.Unlock()
  1160  	if a.mongoInitialized {
  1161  		logger.Debugf("mongo is already initialized")
  1162  		return nil
  1163  	}
  1164  	defer func() {
  1165  		if err == nil {
  1166  			a.mongoInitialized = true
  1167  		}
  1168  	}()
  1169  
  1170  	mongoInstalled, err := mongo.IsServiceInstalled()
  1171  	if err != nil {
  1172  		return errors.Annotate(err, "error while checking if mongodb service is installed")
  1173  	}
  1174  
  1175  	if !mongoInstalled {
  1176  		// EnsureMongoServer installs/upgrades the init config as necessary.
  1177  		ensureServerParams, err := cmdutil.NewEnsureServerParams(agentConfig)
  1178  		if err != nil {
  1179  			return err
  1180  		}
  1181  		if err := cmdutil.EnsureMongoServer(ensureServerParams); err != nil {
  1182  			return err
  1183  		}
  1184  	}
  1185  	logger.Debugf("mongodb service is installed")
  1186  
  1187  	// Mongo is installed, record the version.
  1188  	err = a.ChangeConfig(func(config agent.ConfigSetter) error {
  1189  		config.SetMongoVersion(mongo.InstalledVersion())
  1190  		return nil
  1191  	})
  1192  	if err != nil {
  1193  		return errors.Annotate(err, "cannot set mongo version")
  1194  	}
  1195  	return nil
  1196  }
  1197  
  1198  func openState(agentConfig agent.Config, dialOpts mongo.DialOpts) (_ *state.State, _ *state.Machine, err error) {
  1199  	info, ok := agentConfig.MongoInfo()
  1200  	if !ok {
  1201  		return nil, nil, fmt.Errorf("no state info available")
  1202  	}
  1203  	st, err := state.Open(agentConfig.Model(), info, dialOpts, environs.NewStatePolicy())
  1204  	if err != nil {
  1205  		return nil, nil, err
  1206  	}
  1207  	defer func() {
  1208  		if err != nil {
  1209  			st.Close()
  1210  		}
  1211  	}()
  1212  	m0, err := st.FindEntity(agentConfig.Tag())
  1213  	if err != nil {
  1214  		if errors.IsNotFound(err) {
  1215  			err = worker.ErrTerminateAgent
  1216  		}
  1217  		return nil, nil, err
  1218  	}
  1219  	m := m0.(*state.Machine)
  1220  	if m.Life() == state.Dead {
  1221  		return nil, nil, worker.ErrTerminateAgent
  1222  	}
  1223  	// Check the machine nonce as provisioned matches the agent.Conf value.
  1224  	if !m.CheckProvisioned(agentConfig.Nonce()) {
  1225  		// The agent is running on a different machine to the one it
  1226  		// should be according to state. It must stop immediately.
  1227  		logger.Errorf("running machine %v agent on inappropriate instance", m)
  1228  		return nil, nil, worker.ErrTerminateAgent
  1229  	}
  1230  	return st, m, nil
  1231  }
  1232  
  1233  func getMachine(st *state.State, tag names.Tag) (*state.Machine, error) {
  1234  	m0, err := st.FindEntity(tag)
  1235  	if err != nil {
  1236  		return nil, err
  1237  	}
  1238  	return m0.(*state.Machine), nil
  1239  }
  1240  
  1241  // startWorkerAfterUpgrade starts a worker to run the specified child worker
  1242  // but only after waiting for upgrades to complete.
  1243  func (a *MachineAgent) startWorkerAfterUpgrade(runner worker.Runner, name string, start func() (worker.Worker, error)) {
  1244  	runner.StartWorker(name, func() (worker.Worker, error) {
  1245  		return a.upgradeWaiterWorker(name, start), nil
  1246  	})
  1247  }
  1248  
  1249  // upgradeWaiterWorker runs the specified worker after upgrades have completed.
  1250  func (a *MachineAgent) upgradeWaiterWorker(name string, start func() (worker.Worker, error)) worker.Worker {
  1251  	return worker.NewSimpleWorker(func(stop <-chan struct{}) error {
  1252  		// Wait for the agent upgrade and upgrade steps to complete (or for us to be stopped).
  1253  		for _, ch := range []<-chan struct{}{
  1254  			a.upgradeComplete.Unlocked(),
  1255  			a.initialUpgradeCheckComplete.Unlocked(),
  1256  		} {
  1257  			select {
  1258  			case <-stop:
  1259  				return nil
  1260  			case <-ch:
  1261  			}
  1262  		}
  1263  		logger.Debugf("upgrades done, starting worker %q", name)
  1264  
  1265  		// Upgrades are done, start the worker.
  1266  		worker, err := start()
  1267  		if err != nil {
  1268  			return err
  1269  		}
  1270  		// Wait for worker to finish or for us to be stopped.
  1271  		waitCh := make(chan error)
  1272  		go func() {
  1273  			waitCh <- worker.Wait()
  1274  		}()
  1275  		select {
  1276  		case err := <-waitCh:
  1277  			logger.Debugf("worker %q exited with %v", name, err)
  1278  			return err
  1279  		case <-stop:
  1280  			logger.Debugf("stopping so killing worker %q", name)
  1281  			worker.Kill()
  1282  		}
  1283  		return <-waitCh // Ensure worker has stopped before returning.
  1284  	})
  1285  }
  1286  
// WorkersStarted returns a channel that's closed once all top level workers
// have been started. This is provided for testing purposes.
//
// The channel is returned receive-only, so callers can wait on it but
// cannot close it.
func (a *MachineAgent) WorkersStarted() <-chan struct{} {
	return a.workersStarted
}
  1292  
// Tag returns the machine tag built from the agent's machine id.
func (a *MachineAgent) Tag() names.Tag {
	return names.NewMachineTag(a.machineId)
}
  1296  
  1297  func (a *MachineAgent) createJujudSymlinks(dataDir string) error {
  1298  	jujud := filepath.Join(tools.ToolsDir(dataDir, a.Tag().String()), jujunames.Jujud)
  1299  	for _, link := range []string{jujuRun, jujuDumpLogs} {
  1300  		err := a.createSymlink(jujud, link)
  1301  		if err != nil {
  1302  			return errors.Annotatef(err, "failed to create %s symlink", link)
  1303  		}
  1304  	}
  1305  	return nil
  1306  }
  1307  
  1308  func (a *MachineAgent) createSymlink(target, link string) error {
  1309  	fullLink := utils.EnsureBaseDir(a.rootDir, link)
  1310  
  1311  	currentTarget, err := symlink.Read(fullLink)
  1312  	if err != nil && !os.IsNotExist(err) {
  1313  		return err
  1314  	} else if err == nil {
  1315  		// Link already in place - check it.
  1316  		if currentTarget == target {
  1317  			// Link already points to the right place - nothing to do.
  1318  			return nil
  1319  		}
  1320  		// Link points to the wrong place - delete it.
  1321  		if err := os.Remove(fullLink); err != nil {
  1322  			return err
  1323  		}
  1324  	}
  1325  
  1326  	if err := os.MkdirAll(filepath.Dir(fullLink), os.FileMode(0755)); err != nil {
  1327  		return err
  1328  	}
  1329  	return symlink.New(target, fullLink)
  1330  }
  1331  
  1332  func (a *MachineAgent) removeJujudSymlinks() (errs []error) {
  1333  	for _, link := range []string{jujuRun, jujuDumpLogs} {
  1334  		err := os.Remove(utils.EnsureBaseDir(a.rootDir, link))
  1335  		if err != nil && !os.IsNotExist(err) {
  1336  			errs = append(errs, errors.Annotatef(err, "failed to remove %s symlink", link))
  1337  		}
  1338  	}
  1339  	return
  1340  }
  1341  
  1342  func (a *MachineAgent) uninstallAgent() error {
  1343  	// We should only uninstall if the uninstall file is present.
  1344  	if !agent.CanUninstall(a) {
  1345  		logger.Infof("ignoring uninstall request")
  1346  		return nil
  1347  	}
  1348  	logger.Infof("uninstalling agent")
  1349  
  1350  	agentConfig := a.CurrentConfig()
  1351  	var errs []error
  1352  	agentServiceName := agentConfig.Value(agent.AgentServiceName)
  1353  	if agentServiceName == "" {
  1354  		// For backwards compatibility, handle lack of AgentServiceName.
  1355  		agentServiceName = os.Getenv("UPSTART_JOB")
  1356  	}
  1357  
  1358  	if agentServiceName != "" {
  1359  		svc, err := service.DiscoverService(agentServiceName, common.Conf{})
  1360  		if err != nil {
  1361  			errs = append(errs, fmt.Errorf("cannot remove service %q: %v", agentServiceName, err))
  1362  		} else if err := svc.Remove(); err != nil {
  1363  			errs = append(errs, fmt.Errorf("cannot remove service %q: %v", agentServiceName, err))
  1364  		}
  1365  	}
  1366  
  1367  	errs = append(errs, a.removeJujudSymlinks()...)
  1368  
  1369  	// TODO(fwereade): surely this shouldn't be happening here? Once we're
  1370  	// at this point we should expect to be killed in short order; if this
  1371  	// work is remotely important we should be blocking machine death on
  1372  	// its completion.
  1373  	insideContainer := container.RunningInContainer()
  1374  	if insideContainer {
  1375  		// We're running inside LXC, so loop devices may leak. Detach
  1376  		// any loop devices that are backed by files on this machine.
  1377  		//
  1378  		// It is necessary to do this here as well as in container/lxc,
  1379  		// as container/lxc needs to check in the container's rootfs
  1380  		// to see if the loop device is attached to the container; that
  1381  		// will fail if the data-dir is removed first.
  1382  		if err := a.loopDeviceManager.DetachLoopDevices("/", agentConfig.DataDir()); err != nil {
  1383  			errs = append(errs, err)
  1384  		}
  1385  	}
  1386  
  1387  	if err := mongo.RemoveService(); err != nil {
  1388  		errs = append(errs, errors.Annotate(err, "cannot stop/remove mongo service"))
  1389  	}
  1390  	if err := os.RemoveAll(agentConfig.DataDir()); err != nil {
  1391  		errs = append(errs, err)
  1392  	}
  1393  	if len(errs) == 0 {
  1394  		return nil
  1395  	}
  1396  	return fmt.Errorf("uninstall failed: %v", errs)
  1397  }
  1398  
  1399  func newConnRunner(conns ...cmdutil.Pinger) worker.Runner {
  1400  	return worker.NewRunner(cmdutil.ConnectionIsFatal(logger, conns...), cmdutil.MoreImportant, worker.RestartDelay)
  1401  }
  1402  
// MongoSessioner is implemented by anything that can hand out a mongo
// session. newSingularStateRunner uses it to obtain the session that backs
// singularStateConn.
type MongoSessioner interface {
	// MongoSession returns the mgo session to use.
	MongoSession() *mgo.Session
}
  1406  
  1407  func newSingularStateRunner(runner worker.Runner, st MongoSessioner, m *state.Machine) (worker.Runner, error) {
  1408  	singularStateConn := singularStateConn{st.MongoSession(), m}
  1409  	singularRunner, err := newSingularRunner(runner, singularStateConn)
  1410  	if err != nil {
  1411  		return nil, errors.Annotate(err, "cannot make singular State Runner")
  1412  	}
  1413  	return singularRunner, err
  1414  }
  1415  
// singularStateConn implements singular.Conn on
// top of a State connection.
type singularStateConn struct {
	// session is the mongo session used for pinging and master checks.
	session *mgo.Session
	// machine is the machine passed to mongo.IsMaster alongside the session.
	machine *state.Machine
}
  1422  
// IsMaster reports the result of mongo.IsMaster for the connection's
// session and machine.
func (c singularStateConn) IsMaster() (bool, error) {
	return mongo.IsMaster(c.session, c.machine)
}
  1426  
// Ping pings the underlying mongo session and returns its error, if any.
func (c singularStateConn) Ping() error {
	return c.session.Ping()
}
  1430  
  1431  func metricAPI(st api.Connection) (metricsmanager.MetricsManagerClient, error) {
  1432  	client, err := metricsmanager.NewClient(st)
  1433  	if err != nil {
  1434  		return nil, errors.Trace(err)
  1435  	}
  1436  	return client, nil
  1437  }
  1438  
// newDeployContext gives the tests the opportunity to create a deployer.Context
// that can be used for testing, so as to (1) avoid deploying units to the system
// running the tests and (2) get access to the *State used internally, so that
// tests can be run without waiting for the 5s watcher refresh time to which we would
// otherwise be restricted.
//
// It is a package-level variable so that it can be replaced; the default
// implementation returns deployer.NewSimpleContext(agentConfig, st).
var newDeployContext = func(st *apideployer.State, agentConfig agent.Config) deployer.Context {
	return deployer.NewSimpleContext(agentConfig, st)
}