github.com/wallyworld/juju@v0.0.0-20161013125918-6cf1bc9d917a/cmd/jujud/agent/machine.go (about)

     1  // Copyright 2012, 2013 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package agent
     5  
     6  import (
     7  	"fmt"
     8  	"net"
     9  	"os"
    10  	"path/filepath"
    11  	"runtime"
    12  	"strconv"
    13  	"strings"
    14  	"sync"
    15  	"time"
    16  
    17  	"github.com/juju/cmd"
    18  	"github.com/juju/errors"
    19  	"github.com/juju/gnuflag"
    20  	"github.com/juju/juju/api"
    21  	apiagent "github.com/juju/juju/api/agent"
    22  	"github.com/juju/juju/api/base"
    23  	apimachiner "github.com/juju/juju/api/machiner"
    24  	"github.com/juju/juju/controller"
    25  	"github.com/juju/loggo"
    26  	"github.com/juju/replicaset"
    27  	"github.com/juju/utils"
    28  	"github.com/juju/utils/clock"
    29  	"github.com/juju/utils/featureflag"
    30  	"github.com/juju/utils/series"
    31  	"github.com/juju/utils/set"
    32  	"github.com/juju/utils/symlink"
    33  	"github.com/juju/utils/voyeur"
    34  	"github.com/juju/version"
    35  	"gopkg.in/juju/charmrepo.v2-unstable"
    36  	"gopkg.in/juju/names.v2"
    37  	"gopkg.in/mgo.v2"
    38  	"gopkg.in/natefinch/lumberjack.v2"
    39  	"gopkg.in/tomb.v1"
    40  
    41  	"github.com/juju/juju/agent"
    42  	"github.com/juju/juju/agent/tools"
    43  	apideployer "github.com/juju/juju/api/deployer"
    44  	"github.com/juju/juju/api/metricsmanager"
    45  	apiprovisioner "github.com/juju/juju/api/provisioner"
    46  	"github.com/juju/juju/apiserver"
    47  	"github.com/juju/juju/apiserver/observer"
    48  	"github.com/juju/juju/apiserver/params"
    49  	"github.com/juju/juju/audit"
    50  	"github.com/juju/juju/cert"
    51  	"github.com/juju/juju/cmd/jujud/agent/machine"
    52  	"github.com/juju/juju/cmd/jujud/agent/model"
    53  	"github.com/juju/juju/cmd/jujud/reboot"
    54  	cmdutil "github.com/juju/juju/cmd/jujud/util"
    55  	"github.com/juju/juju/container"
    56  	"github.com/juju/juju/container/kvm"
    57  	"github.com/juju/juju/environs"
    58  	"github.com/juju/juju/environs/simplestreams"
    59  	"github.com/juju/juju/instance"
    60  	jujunames "github.com/juju/juju/juju/names"
    61  	"github.com/juju/juju/juju/paths"
    62  	"github.com/juju/juju/mongo"
    63  	"github.com/juju/juju/service"
    64  	"github.com/juju/juju/service/common"
    65  	"github.com/juju/juju/state"
    66  	"github.com/juju/juju/state/multiwatcher"
    67  	"github.com/juju/juju/state/stateenvirons"
    68  	"github.com/juju/juju/storage/looputil"
    69  	"github.com/juju/juju/upgrades"
    70  	jujuversion "github.com/juju/juju/version"
    71  	"github.com/juju/juju/watcher"
    72  	"github.com/juju/juju/worker"
    73  	"github.com/juju/juju/worker/apicaller"
    74  	"github.com/juju/juju/worker/certupdater"
    75  	"github.com/juju/juju/worker/conv2state"
    76  	"github.com/juju/juju/worker/dblogpruner"
    77  	"github.com/juju/juju/worker/dependency"
    78  	"github.com/juju/juju/worker/deployer"
    79  	"github.com/juju/juju/worker/gate"
    80  	"github.com/juju/juju/worker/imagemetadataworker"
    81  	"github.com/juju/juju/worker/introspection"
    82  	"github.com/juju/juju/worker/logsender"
    83  	"github.com/juju/juju/worker/migrationmaster"
    84  	"github.com/juju/juju/worker/modelworkermanager"
    85  	"github.com/juju/juju/worker/mongoupgrader"
    86  	"github.com/juju/juju/worker/peergrouper"
    87  	"github.com/juju/juju/worker/provisioner"
    88  	"github.com/juju/juju/worker/singular"
    89  	"github.com/juju/juju/worker/txnpruner"
    90  	"github.com/juju/juju/worker/upgradesteps"
    91  )
    92  
var (
	// logger is the logger for this package, registered under the
	// name "juju.cmd.jujud". jujuRun and jujuDumpLogs are the values
	// reported by paths.JujuRun and paths.JujuDumpLogs for the host
	// series, unwrapped with paths.MustSucceed.
	logger       = loggo.GetLogger("juju.cmd.jujud")
	jujuRun      = paths.MustSucceed(paths.JujuRun(series.HostSeries()))
	jujuDumpLogs = paths.MustSucceed(paths.JujuDumpLogs(series.HostSeries()))

	// The following are defined as variables to allow the tests to
	// intercept calls to the functions. In every case, they should
	// be expressed as explicit dependencies, but nobody has yet had
	// the intestinal fortitude to untangle this package. Be that
	// person! Juju Needs You.
	useMultipleCPUs       = utils.UseMultipleCPUs
	newSingularRunner     = singular.New
	peergrouperNew        = peergrouper.New
	newCertificateUpdater = certupdater.NewCertificateUpdater
	newMetadataUpdater    = imagemetadataworker.NewWorker
	newUpgradeMongoWorker = mongoupgrader.New
	reportOpenedState     = func(*state.State) {}

	// modelManifolds and machineManifolds are the manifold
	// constructors from the model and machine packages;
	// makeEngineCreator calls machineManifolds to populate the
	// machine agent's dependency engine.
	modelManifolds   = model.Manifolds
	machineManifolds = machine.Manifolds
)
   114  
// ProductionMongoWriteConcern controls whether the PostDial hook
// installed by init requests journalled writes (and write-majority when
// the replica set configuration can be read). It defaults to true and is
// a variable so that tests can override it.
var ProductionMongoWriteConcern = true
   117  
   118  func init() {
   119  	stateWorkerDialOpts = mongo.DefaultDialOpts()
   120  	stateWorkerDialOpts.PostDial = func(session *mgo.Session) error {
   121  		safe := mgo.Safe{}
   122  		if ProductionMongoWriteConcern {
   123  			safe.J = true
   124  			_, err := replicaset.CurrentConfig(session)
   125  			if err == nil {
   126  				// set mongo to write-majority (writes only returned after
   127  				// replicated to a majority of replica-set members).
   128  				safe.WMode = "majority"
   129  			}
   130  		}
   131  		session.SetSafe(&safe)
   132  		return nil
   133  	}
   134  }
   135  
// AgentInitializer handles initializing a type for use as a Jujud
// agent.
type AgentInitializer interface {
	// AddFlags registers the initializer's flags on the given flag
	// set. machineAgentCmd.SetFlags calls it before adding its own
	// flags.
	AddFlags(*gnuflag.FlagSet)
	// CheckArgs validates the given command-line arguments.
	// machineAgentCmd.Init returns any error it reports.
	CheckArgs([]string) error
}
   142  
// AgentConfigWriter encapsulates disk I/O operations with the agent
// config. MachineAgent embeds it, and machineAgentCmd uses it to locate
// the agent's log file.
type AgentConfigWriter interface {
	// ReadConfig reads the config for the given tag from disk. The tag
	// is passed in its string form (e.g. the result of
	// names.MachineTag.String).
	ReadConfig(tag string) error
	// ChangeConfig executes the given agent.ConfigMutator in a
	// thread-safe context.
	ChangeConfig(agent.ConfigMutator) error
	// CurrentConfig returns a copy of the in-memory agent config.
	CurrentConfig() agent.Config
}
   154  
   155  // NewMachineAgentCmd creates a Command which handles parsing
   156  // command-line arguments and instantiating and running a
   157  // MachineAgent.
   158  func NewMachineAgentCmd(
   159  	ctx *cmd.Context,
   160  	machineAgentFactory func(string) *MachineAgent,
   161  	agentInitializer AgentInitializer,
   162  	configFetcher AgentConfigWriter,
   163  ) cmd.Command {
   164  	return &machineAgentCmd{
   165  		ctx:                 ctx,
   166  		machineAgentFactory: machineAgentFactory,
   167  		agentInitializer:    agentInitializer,
   168  		currentConfig:       configFetcher,
   169  	}
   170  }
   171  
// machineAgentCmd is the cmd.Command implementation returned by
// NewMachineAgentCmd. It validates the command line in Init and
// delegates the real work to a MachineAgent in Run.
type machineAgentCmd struct {
	cmd.CommandBase

	// This group of arguments is required. agentInitializer supplies
	// extra flags and argument checks, currentConfig reads the agent
	// config from disk, machineAgentFactory builds the MachineAgent
	// for a machine id, and ctx is the context whose Stderr Init may
	// replace with a rotating log file.
	agentInitializer    AgentInitializer
	currentConfig       AgentConfigWriter
	machineAgentFactory func(string) *MachineAgent
	ctx                 *cmd.Context

	// This group is for debugging purposes. When logToStdErr is true,
	// Init leaves ctx.Stderr untouched.
	logToStdErr bool

	// The following are set via command-line flags.
	machineId string
}
   187  
   188  // Init is called by the cmd system to initialize the structure for
   189  // running.
   190  func (a *machineAgentCmd) Init(args []string) error {
   191  
   192  	if !names.IsValidMachine(a.machineId) {
   193  		return errors.Errorf("--machine-id option must be set, and expects a non-negative integer")
   194  	}
   195  	if err := a.agentInitializer.CheckArgs(args); err != nil {
   196  		return err
   197  	}
   198  
   199  	// Due to changes in the logging, and needing to care about old
   200  	// models that have been upgraded, we need to explicitly remove the
   201  	// file writer if one has been added, otherwise we will get duplicate
   202  	// lines of all logging in the log file.
   203  	loggo.RemoveWriter("logfile")
   204  
   205  	if a.logToStdErr {
   206  		return nil
   207  	}
   208  
   209  	err := a.currentConfig.ReadConfig(names.NewMachineTag(a.machineId).String())
   210  	if err != nil {
   211  		return errors.Annotate(err, "cannot read agent configuration")
   212  	}
   213  
   214  	// the context's stderr is set as the loggo writer in github.com/juju/cmd/logging.go
   215  	a.ctx.Stderr = &lumberjack.Logger{
   216  		Filename:   agent.LogFilename(a.currentConfig.CurrentConfig()),
   217  		MaxSize:    300, // megabytes
   218  		MaxBackups: 2,
   219  	}
   220  
   221  	return nil
   222  }
   223  
   224  // Run instantiates a MachineAgent and runs it.
   225  func (a *machineAgentCmd) Run(c *cmd.Context) error {
   226  	machineAgent := a.machineAgentFactory(a.machineId)
   227  	return machineAgent.Run(c)
   228  }
   229  
   230  // SetFlags adds the requisite flags to run this command.
   231  func (a *machineAgentCmd) SetFlags(f *gnuflag.FlagSet) {
   232  	a.agentInitializer.AddFlags(f)
   233  	f.StringVar(&a.machineId, "machine-id", "", "id of the machine to run")
   234  }
   235  
   236  // Info returns usage information for the command.
   237  func (a *machineAgentCmd) Info() *cmd.Info {
   238  	return &cmd.Info{
   239  		Name:    "machine",
   240  		Purpose: "run a juju machine agent",
   241  	}
   242  }
   243  
   244  // MachineAgentFactoryFn returns a function which instantiates a
   245  // MachineAgent given a machineId.
   246  func MachineAgentFactoryFn(
   247  	agentConfWriter AgentConfigWriter,
   248  	bufferedLogs logsender.LogRecordCh,
   249  	rootDir string,
   250  ) func(string) *MachineAgent {
   251  	return func(machineId string) *MachineAgent {
   252  		return NewMachineAgent(
   253  			machineId,
   254  			agentConfWriter,
   255  			bufferedLogs,
   256  			worker.NewRunner(cmdutil.IsFatal, cmdutil.MoreImportant, worker.RestartDelay),
   257  			looputil.NewLoopDeviceManager(),
   258  			rootDir,
   259  		)
   260  	}
   261  }
   262  
   263  // NewMachineAgent instantiates a new MachineAgent.
   264  func NewMachineAgent(
   265  	machineId string,
   266  	agentConfWriter AgentConfigWriter,
   267  	bufferedLogs logsender.LogRecordCh,
   268  	runner worker.Runner,
   269  	loopDeviceManager looputil.LoopDeviceManager,
   270  	rootDir string,
   271  ) *MachineAgent {
   272  	return &MachineAgent{
   273  		machineId:                   machineId,
   274  		AgentConfigWriter:           agentConfWriter,
   275  		configChangedVal:            voyeur.NewValue(true),
   276  		bufferedLogs:                bufferedLogs,
   277  		workersStarted:              make(chan struct{}),
   278  		runner:                      runner,
   279  		rootDir:                     rootDir,
   280  		initialUpgradeCheckComplete: gate.NewLock(),
   281  		loopDeviceManager:           loopDeviceManager,
   282  	}
   283  }
   284  
// MachineAgent is responsible for tying together all functionality
// needed to orchestrate a Jujud instance which controls a machine.
type MachineAgent struct {
	AgentConfigWriter

	// tomb tracks the lifetime of Run; Wait and Stop block on it.
	// runner hosts the top-level "engine" worker started by Run.
	// configChangedVal is set whenever ChangeConfig is called and is
	// handed to the manifolds as AgentConfigChanged. upgradeComplete is
	// created by Run via upgradesteps.NewLock. workersStarted is closed
	// by Run once the engine worker has been handed to the runner.
	tomb             tomb.Tomb
	machineId        string
	runner           worker.Runner
	rootDir          string
	bufferedLogs     logsender.LogRecordCh
	configChangedVal *voyeur.Value
	upgradeComplete  gate.Lock
	workersStarted   chan struct{}

	// XXX(fwereade): these smell strongly of goroutine-unsafeness.
	// They are read and written without synchronization by the
	// PrepareRestore/BeginRestore/EndRestore and IsRestore* methods.
	restoreMode bool
	restoring   bool

	// Used to signal that the upgrade worker will not
	// reboot the agent on startup because there are no
	// longer any immediately pending agent upgrades.
	initialUpgradeCheckComplete gate.Lock

	// NOTE(review): discoverSpacesComplete is not initialized by
	// NewMachineAgent; confirm where (or whether) it is set.
	discoverSpacesComplete gate.Lock

	// NOTE(review): mongoInitMutex presumably guards mongoInitialized;
	// maybeStopMongo reads the flag without taking it - confirm
	// against the code that sets the flag.
	mongoInitMutex   sync.Mutex
	mongoInitialized bool

	loopDeviceManager looputil.LoopDeviceManager
}
   315  
   316  // IsRestorePreparing returns bool representing if we are in restore mode
   317  // but not running restore.
   318  func (a *MachineAgent) IsRestorePreparing() bool {
   319  	return a.restoreMode && !a.restoring
   320  }
   321  
// IsRestoreRunning reports whether the actual restore process is
// running, i.e. BeginRestore has succeeded and EndRestore has not been
// called since.
func (a *MachineAgent) IsRestoreRunning() bool {
	return a.restoring
}
   327  
// isUpgradeRunning reports whether the upgradeComplete lock is still
// locked, i.e. upgrade steps have not yet been marked as finished.
func (a *MachineAgent) isUpgradeRunning() bool {
	return !a.upgradeComplete.IsUnlocked()
}
   331  
// isInitialUpgradeCheckPending reports whether the
// initialUpgradeCheckComplete lock is still locked, i.e. the initial
// check for pending agent upgrades has not finished yet.
func (a *MachineAgent) isInitialUpgradeCheckPending() bool {
	return !a.initialUpgradeCheckComplete.IsUnlocked()
}
   335  
// Wait waits for the machine agent to finish, i.e. for Run to return,
// and reports the error the agent's tomb died with.
func (a *MachineAgent) Wait() error {
	return a.tomb.Wait()
}
   340  
   341  // Stop stops the machine agent.
   342  func (a *MachineAgent) Stop() error {
   343  	a.runner.Kill()
   344  	return a.tomb.Wait()
   345  }
   346  
   347  // upgradeCertificateDNSNames ensure that the controller certificate
   348  // recorded in the agent config and also mongo server.pem contains the
   349  // DNSNames entries required by Juju.
   350  func upgradeCertificateDNSNames(config agent.ConfigSetter) error {
   351  	si, ok := config.StateServingInfo()
   352  	if !ok || si.CAPrivateKey == "" {
   353  		// No certificate information exists yet, nothing to do.
   354  		return nil
   355  	}
   356  
   357  	// Validate the current certificate and private key pair, and then
   358  	// extract the current DNS names from the certificate. If the
   359  	// certificate validation fails, or it does not contain the DNS
   360  	// names we require, we will generate a new one.
   361  	var dnsNames set.Strings
   362  	serverCert, _, err := cert.ParseCertAndKey(si.Cert, si.PrivateKey)
   363  	if err != nil {
   364  		// The certificate is invalid, so create a new one.
   365  		logger.Infof("parsing certificate/key failed, will generate a new one: %v", err)
   366  		dnsNames = set.NewStrings()
   367  	} else {
   368  		dnsNames = set.NewStrings(serverCert.DNSNames...)
   369  	}
   370  
   371  	update := false
   372  	requiredDNSNames := []string{"local", "juju-apiserver", "juju-mongodb"}
   373  	for _, dnsName := range requiredDNSNames {
   374  		if dnsNames.Contains(dnsName) {
   375  			continue
   376  		}
   377  		dnsNames.Add(dnsName)
   378  		update = true
   379  	}
   380  	if !update {
   381  		return nil
   382  	}
   383  
   384  	// Write a new certificate to the mongo pem and agent config files.
   385  	si.Cert, si.PrivateKey, err = cert.NewDefaultServer(config.CACert(), si.CAPrivateKey, dnsNames.Values())
   386  	if err != nil {
   387  		return err
   388  	}
   389  	if err := mongo.UpdateSSLKey(config.DataDir(), si.Cert, si.PrivateKey); err != nil {
   390  		return err
   391  	}
   392  	config.SetStateServingInfo(si)
   393  	return nil
   394  }
   395  
   396  // Run runs a machine agent.
   397  func (a *MachineAgent) Run(*cmd.Context) error {
   398  
   399  	defer a.tomb.Done()
   400  	if err := a.ReadConfig(a.Tag().String()); err != nil {
   401  		return errors.Errorf("cannot read agent configuration: %v", err)
   402  	}
   403  
   404  	logger.Infof("machine agent %v start (%s [%s])", a.Tag(), jujuversion.Current, runtime.Compiler)
   405  	if flags := featureflag.String(); flags != "" {
   406  		logger.Warningf("developer feature flags enabled: %s", flags)
   407  	}
   408  	if err := introspection.WriteProfileFunctions(); err != nil {
   409  		// This isn't fatal, just annoying.
   410  		logger.Errorf("failed to write profile funcs: %v", err)
   411  	}
   412  
   413  	// Before doing anything else, we need to make sure the certificate generated for
   414  	// use by mongo to validate controller connections is correct. This needs to be done
   415  	// before any possible restart of the mongo service.
   416  	// See bug http://pad.lv/1434680
   417  	if err := a.AgentConfigWriter.ChangeConfig(upgradeCertificateDNSNames); err != nil {
   418  		return errors.Annotate(err, "error upgrading server certificate")
   419  	}
   420  
   421  	if upgradeComplete, err := upgradesteps.NewLock(a); err != nil {
   422  		return errors.Annotate(err, "error during creating upgrade completion channel")
   423  	} else {
   424  		a.upgradeComplete = upgradeComplete
   425  	}
   426  
   427  	agentConfig := a.CurrentConfig()
   428  	createEngine := a.makeEngineCreator(agentConfig.UpgradedToVersion())
   429  	charmrepo.CacheDir = filepath.Join(agentConfig.DataDir(), "charmcache")
   430  	if err := a.createJujudSymlinks(agentConfig.DataDir()); err != nil {
   431  		return err
   432  	}
   433  	a.runner.StartWorker("engine", createEngine)
   434  
   435  	// At this point, all workers will have been configured to start
   436  	close(a.workersStarted)
   437  	err := a.runner.Wait()
   438  	switch errors.Cause(err) {
   439  	case worker.ErrTerminateAgent:
   440  		err = a.uninstallAgent()
   441  	case worker.ErrRebootMachine:
   442  		logger.Infof("Caught reboot error")
   443  		err = a.executeRebootOrShutdown(params.ShouldReboot)
   444  	case worker.ErrShutdownMachine:
   445  		logger.Infof("Caught shutdown error")
   446  		err = a.executeRebootOrShutdown(params.ShouldShutdown)
   447  	}
   448  	err = cmdutil.AgentDone(logger, err)
   449  	a.tomb.Kill(err)
   450  	return err
   451  }
   452  
   453  func (a *MachineAgent) makeEngineCreator(previousAgentVersion version.Number) func() (worker.Worker, error) {
   454  	return func() (worker.Worker, error) {
   455  		config := dependency.EngineConfig{
   456  			IsFatal:     cmdutil.IsFatal,
   457  			WorstError:  cmdutil.MoreImportantError,
   458  			ErrorDelay:  3 * time.Second,
   459  			BounceDelay: 10 * time.Millisecond,
   460  		}
   461  		engine, err := dependency.NewEngine(config)
   462  		if err != nil {
   463  			return nil, err
   464  		}
   465  		manifolds := machineManifolds(machine.ManifoldsConfig{
   466  			PreviousAgentVersion: previousAgentVersion,
   467  			Agent:                agent.APIHostPortsSetter{Agent: a},
   468  			RootDir:              a.rootDir,
   469  			AgentConfigChanged:   a.configChangedVal,
   470  			UpgradeStepsLock:     a.upgradeComplete,
   471  			UpgradeCheckLock:     a.initialUpgradeCheckComplete,
   472  			OpenState:            a.initState,
   473  			OpenStateForUpgrade:  a.openStateForUpgrade,
   474  			StartStateWorkers:    a.startStateWorkers,
   475  			StartAPIWorkers:      a.startAPIWorkers,
   476  			PreUpgradeSteps:      upgrades.PreUpgradeSteps,
   477  			LogSource:            a.bufferedLogs,
   478  			NewDeployContext:     newDeployContext,
   479  			Clock:                clock.WallClock,
   480  			ValidateMigration:    a.validateMigration,
   481  		})
   482  		if err := dependency.Install(engine, manifolds); err != nil {
   483  			if err := worker.Stop(engine); err != nil {
   484  				logger.Errorf("while stopping engine with bad manifolds: %v", err)
   485  			}
   486  			return nil, err
   487  		}
   488  		if err := startIntrospection(introspectionConfig{
   489  			Agent:      a,
   490  			Engine:     engine,
   491  			WorkerFunc: introspection.NewWorker,
   492  		}); err != nil {
   493  			// If the introspection worker failed to start, we just log error
   494  			// but continue. It is very unlikely to happen in the real world
   495  			// as the only issue is connecting to the abstract domain socket
   496  			// and the agent is controlled by by the OS to only have one.
   497  			logger.Errorf("failed to start introspection worker: %v", err)
   498  		}
   499  		return engine, nil
   500  	}
   501  }
   502  
   503  func (a *MachineAgent) executeRebootOrShutdown(action params.RebootAction) error {
   504  	// At this stage, all API connections would have been closed
   505  	// We need to reopen the API to clear the reboot flag after
   506  	// scheduling the reboot. It may be cleaner to do this in the reboot
   507  	// worker, before returning the ErrRebootMachine.
   508  	conn, err := apicaller.OnlyConnect(a, api.Open)
   509  	if err != nil {
   510  		logger.Infof("Reboot: Error connecting to state")
   511  		return errors.Trace(err)
   512  	}
   513  
   514  	// block until all units/containers are ready, and reboot/shutdown
   515  	finalize, err := reboot.NewRebootWaiter(conn, a.CurrentConfig())
   516  	if err != nil {
   517  		return errors.Trace(err)
   518  	}
   519  
   520  	logger.Infof("Reboot: Executing reboot")
   521  	err = finalize.ExecuteReboot(action)
   522  	if err != nil {
   523  		logger.Infof("Reboot: Error executing reboot: %v", err)
   524  		return errors.Trace(err)
   525  	}
   526  	// On windows, the shutdown command is asynchronous. We return ErrRebootMachine
   527  	// so the agent will simply exit without error pending reboot/shutdown.
   528  	return worker.ErrRebootMachine
   529  }
   530  
   531  func (a *MachineAgent) ChangeConfig(mutate agent.ConfigMutator) error {
   532  	err := a.AgentConfigWriter.ChangeConfig(mutate)
   533  	a.configChangedVal.Set(true)
   534  	return errors.Trace(err)
   535  }
   536  
   537  func (a *MachineAgent) maybeStopMongo(ver mongo.Version, isMaster bool) error {
   538  	if !a.mongoInitialized {
   539  		return nil
   540  	}
   541  
   542  	conf := a.AgentConfigWriter.CurrentConfig()
   543  	v := conf.MongoVersion()
   544  
   545  	logger.Errorf("Got version change %v", ver)
   546  	// TODO(perrito666) replace with "read-only" mode for environment when
   547  	// it is available.
   548  	if ver.NewerThan(v) > 0 {
   549  		err := a.AgentConfigWriter.ChangeConfig(func(config agent.ConfigSetter) error {
   550  			config.SetMongoVersion(mongo.MongoUpgrade)
   551  			return nil
   552  		})
   553  		if err != nil {
   554  			return err
   555  		}
   556  
   557  	}
   558  	return nil
   559  
   560  }
   561  
   562  // PrepareRestore will flag the agent to allow only a limited set
   563  // of commands defined in
   564  // "github.com/juju/juju/apiserver".allowedMethodsAboutToRestore
   565  // the most noteworthy is:
   566  // Backups.Restore: this will ensure that we can do all the file movements
   567  // required for restore and no one will do changes while we do that.
   568  // it will return error if the machine is already in this state.
   569  func (a *MachineAgent) PrepareRestore() error {
   570  	if a.restoreMode {
   571  		return errors.Errorf("already in restore mode")
   572  	}
   573  	a.restoreMode = true
   574  	return nil
   575  }
   576  
   577  // BeginRestore will flag the agent to disallow all commands since
   578  // restore should be running and therefore making changes that
   579  // would override anything done.
   580  func (a *MachineAgent) BeginRestore() error {
   581  	switch {
   582  	case !a.restoreMode:
   583  		return errors.Errorf("not in restore mode, cannot begin restoration")
   584  	case a.restoring:
   585  		return errors.Errorf("already restoring")
   586  	}
   587  	a.restoring = true
   588  	return nil
   589  }
   590  
   591  // EndRestore will flag the agent to allow all commands
   592  // This being invoked means that restore process failed
   593  // since success restarts the agent.
   594  func (a *MachineAgent) EndRestore() {
   595  	a.restoreMode = false
   596  	a.restoring = false
   597  }
   598  
   599  // newRestoreStateWatcherWorker will return a worker or err if there
   600  // is a failure, the worker takes care of watching the state of
   601  // restoreInfo doc and put the agent in the different restore modes.
   602  func (a *MachineAgent) newRestoreStateWatcherWorker(st *state.State) (worker.Worker, error) {
   603  	rWorker := func(stopch <-chan struct{}) error {
   604  		return a.restoreStateWatcher(st, stopch)
   605  	}
   606  	return worker.NewSimpleWorker(rWorker), nil
   607  }
   608  
   609  // restoreChanged will be called whenever restoreInfo doc changes signaling a new
   610  // step in the restore process.
   611  func (a *MachineAgent) restoreChanged(st *state.State) error {
   612  	status, err := st.RestoreInfo().Status()
   613  	if err != nil {
   614  		return errors.Annotate(err, "cannot read restore state")
   615  	}
   616  	switch status {
   617  	case state.RestorePending:
   618  		a.PrepareRestore()
   619  	case state.RestoreInProgress:
   620  		a.BeginRestore()
   621  	case state.RestoreFailed:
   622  		a.EndRestore()
   623  	}
   624  	return nil
   625  }
   626  
   627  // restoreStateWatcher watches for restoreInfo looking for changes in the restore process.
   628  func (a *MachineAgent) restoreStateWatcher(st *state.State, stopch <-chan struct{}) error {
   629  	restoreWatch := st.WatchRestoreInfoChanges()
   630  	defer func() {
   631  		restoreWatch.Kill()
   632  		restoreWatch.Wait()
   633  	}()
   634  
   635  	for {
   636  		select {
   637  		case <-restoreWatch.Changes():
   638  			if err := a.restoreChanged(st); err != nil {
   639  				return err
   640  			}
   641  		case <-stopch:
   642  			return nil
   643  		}
   644  	}
   645  }
   646  
// newEnvirons is the environ constructor handed to environs.GetEnviron
// by startAPIWorkers. It is a variable rather than a direct call to
// environs.New, presumably so that tests can replace it.
var newEnvirons = environs.New
   648  
   649  // startAPIWorkers is called to start workers which rely on the
   650  // machine agent's API connection (via the apiworkers manifold). It
   651  // returns a Runner with a number of workers attached to it.
   652  //
   653  // The workers started here need to be converted to run under the
   654  // dependency engine. Once they have all been converted, this method -
   655  // and the apiworkers manifold - can be removed.
   656  func (a *MachineAgent) startAPIWorkers(apiConn api.Connection) (_ worker.Worker, outErr error) {
   657  	agentConfig := a.CurrentConfig()
   658  
   659  	entity, err := apiagent.NewState(apiConn).Entity(a.Tag())
   660  	if err != nil {
   661  		return nil, errors.Trace(err)
   662  	}
   663  
   664  	var isModelManager bool
   665  	for _, job := range entity.Jobs() {
   666  		switch job {
   667  		case multiwatcher.JobManageModel:
   668  			isModelManager = true
   669  		default:
   670  			// TODO(dimitern): Once all workers moved over to using
   671  			// the API, report "unknown job type" here.
   672  		}
   673  	}
   674  
   675  	runner := worker.NewRunner(
   676  		cmdutil.ConnectionIsFatal(logger, apiConn),
   677  		cmdutil.MoreImportant,
   678  		worker.RestartDelay,
   679  	)
   680  	defer func() {
   681  		// If startAPIWorkers exits early with an error, stop the
   682  		// runner so that any already started runners aren't leaked.
   683  		if outErr != nil {
   684  			worker.Stop(runner)
   685  		}
   686  	}()
   687  
   688  	// Perform the operations needed to set up hosting for containers.
   689  	if err := a.setupContainerSupport(runner, apiConn, agentConfig); err != nil {
   690  		cause := errors.Cause(err)
   691  		if params.IsCodeDead(cause) || cause == worker.ErrTerminateAgent {
   692  			return nil, worker.ErrTerminateAgent
   693  		}
   694  		return nil, errors.Errorf("setting up container support: %v", err)
   695  	}
   696  
   697  	if isModelManager {
   698  
   699  		// Published image metadata for some providers are in simple streams.
   700  		// Providers that do not depend on simple streams do not need this worker.
   701  		env, err := environs.GetEnviron(apiagent.NewState(apiConn), newEnvirons)
   702  		if err != nil {
   703  			return nil, errors.Annotate(err, "getting environ")
   704  		}
   705  		if _, ok := env.(simplestreams.HasRegion); ok {
   706  			// Start worker that stores published image metadata in state.
   707  			runner.StartWorker("imagemetadata", func() (worker.Worker, error) {
   708  				return newMetadataUpdater(apiConn.MetadataUpdater()), nil
   709  			})
   710  		}
   711  
   712  		// We don't have instance info set and the network config for the
   713  		// bootstrap machine only, so update it now. All the other machines will
   714  		// have instance info including network config set at provisioning time.
   715  		if err := a.setControllerNetworkConfig(apiConn); err != nil {
   716  			return nil, errors.Annotate(err, "setting controller network config")
   717  		}
   718  	} else {
   719  		runner.StartWorker("stateconverter", func() (worker.Worker, error) {
   720  			// TODO(fwereade): this worker needs its own facade.
   721  			facade := apimachiner.NewState(apiConn)
   722  			handler := conv2state.New(facade, a)
   723  			w, err := watcher.NewNotifyWorker(watcher.NotifyConfig{
   724  				Handler: handler,
   725  			})
   726  			if err != nil {
   727  				return nil, errors.Annotate(err, "cannot start controller promoter worker")
   728  			}
   729  			return w, nil
   730  		})
   731  	}
   732  	return runner, nil
   733  }
   734  
   735  func (a *MachineAgent) setControllerNetworkConfig(apiConn api.Connection) error {
   736  	machinerAPI := apimachiner.NewState(apiConn)
   737  	agentConfig := a.CurrentConfig()
   738  
   739  	tag := agentConfig.Tag().(names.MachineTag)
   740  	machine, err := machinerAPI.Machine(tag)
   741  	if errors.IsNotFound(err) || err == nil && machine.Life() == params.Dead {
   742  		return worker.ErrTerminateAgent
   743  	}
   744  	if err != nil {
   745  		return errors.Annotatef(err, "cannot load machine %s from state", tag)
   746  	}
   747  
   748  	if err := machine.SetProviderNetworkConfig(); err != nil {
   749  		return errors.Annotate(err, "cannot set controller provider network config")
   750  	}
   751  	return nil
   752  }
   753  
   754  // Restart restarts the agent's service.
   755  func (a *MachineAgent) Restart() error {
   756  	name := a.CurrentConfig().Value(agent.AgentServiceName)
   757  	return service.Restart(name)
   758  }
   759  
   760  // openStateForUpgrade exists to be passed into the upgradesteps
   761  // worker. The upgradesteps worker opens state independently of the
   762  // state worker so that it isn't affected by the state worker's
   763  // lifetime. It ensures the MongoDB server is configured and started,
   764  // and then opens a state connection.
   765  //
   766  // TODO(mjs)- review the need for this once the dependency engine is
   767  // in use. Why can't upgradesteps depend on the main state connection?
   768  func (a *MachineAgent) openStateForUpgrade() (*state.State, error) {
   769  	agentConfig := a.CurrentConfig()
   770  	if err := a.ensureMongoServer(agentConfig); err != nil {
   771  		return nil, errors.Trace(err)
   772  	}
   773  	info, ok := agentConfig.MongoInfo()
   774  	if !ok {
   775  		return nil, errors.New("no state info available")
   776  	}
   777  	st, err := state.Open(agentConfig.Model(), agentConfig.Controller(), info, mongo.DefaultDialOpts(),
   778  		stateenvirons.GetNewPolicyFunc(
   779  			stateenvirons.GetNewEnvironFunc(environs.New),
   780  		),
   781  	)
   782  	if err != nil {
   783  		return nil, errors.Trace(err)
   784  	}
   785  	return st, nil
   786  }
   787  
   788  // validateMigration is called by the migrationminion to help check
   789  // that the agent will be ok when connected to a new controller.
   790  func (a *MachineAgent) validateMigration(apiCaller base.APICaller) error {
   791  	// TODO(mjs) - more extensive checks to come.
   792  	facade := apimachiner.NewState(apiCaller)
   793  	_, err := facade.Machine(names.NewMachineTag(a.machineId))
   794  	return errors.Trace(err)
   795  }
   796  
   797  // setupContainerSupport determines what containers can be run on this machine and
   798  // initialises suitable infrastructure to support such containers.
   799  func (a *MachineAgent) setupContainerSupport(runner worker.Runner, st api.Connection, agentConfig agent.Config) error {
   800  	var supportedContainers []instance.ContainerType
   801  	supportsContainers := container.ContainersSupported()
   802  	if supportsContainers {
   803  		supportedContainers = append(supportedContainers, instance.LXD)
   804  	}
   805  
   806  	supportsKvm, err := kvm.IsKVMSupported()
   807  	if err != nil {
   808  		logger.Warningf("determining kvm support: %v\nno kvm containers possible", err)
   809  	}
   810  	if err == nil && supportsKvm {
   811  		supportedContainers = append(supportedContainers, instance.KVM)
   812  	}
   813  
   814  	return a.updateSupportedContainers(runner, st, supportedContainers, agentConfig)
   815  }
   816  
   817  // updateSupportedContainers records in state that a machine can run the specified containers.
   818  // It starts a watcher and when a container of a given type is first added to the machine,
   819  // the watcher is killed, the machine is set up to be able to start containers of the given type,
   820  // and a suitable provisioner is started.
   821  func (a *MachineAgent) updateSupportedContainers(
   822  	runner worker.Runner,
   823  	st api.Connection,
   824  	containers []instance.ContainerType,
   825  	agentConfig agent.Config,
   826  ) error {
   827  	pr := apiprovisioner.NewState(st)
   828  	tag := agentConfig.Tag().(names.MachineTag)
   829  	machine, err := pr.Machine(tag)
   830  	if errors.IsNotFound(err) || err == nil && machine.Life() == params.Dead {
   831  		return worker.ErrTerminateAgent
   832  	}
   833  	if err != nil {
   834  		return errors.Annotatef(err, "cannot load machine %s from state", tag)
   835  	}
   836  	if len(containers) == 0 {
   837  		if err := machine.SupportsNoContainers(); err != nil {
   838  			return errors.Annotatef(err, "clearing supported containers for %s", tag)
   839  		}
   840  		return nil
   841  	}
   842  	if err := machine.SetSupportedContainers(containers...); err != nil {
   843  		return errors.Annotatef(err, "setting supported containers for %s", tag)
   844  	}
   845  	// Start the watcher to fire when a container is first requested on the machine.
   846  	watcherName := fmt.Sprintf("%s-container-watcher", machine.Id())
   847  	params := provisioner.ContainerSetupParams{
   848  		Runner:              runner,
   849  		WorkerName:          watcherName,
   850  		SupportedContainers: containers,
   851  		Machine:             machine,
   852  		Provisioner:         pr,
   853  		Config:              agentConfig,
   854  		InitLockName:        agent.MachineLockName,
   855  	}
   856  	handler := provisioner.NewContainerSetupHandler(params)
   857  	a.startWorkerAfterUpgrade(runner, watcherName, func() (worker.Worker, error) {
   858  		w, err := watcher.NewStringsWorker(watcher.StringsConfig{
   859  			Handler: handler,
   860  		})
   861  		if err != nil {
   862  			return nil, errors.Annotatef(err, "cannot start %s worker", watcherName)
   863  		}
   864  		return w, nil
   865  	})
   866  	return nil
   867  }
   868  
   869  func (a *MachineAgent) initState(agentConfig agent.Config) (*state.State, error) {
   870  	// Start MongoDB server and dial.
   871  	if err := a.ensureMongoServer(agentConfig); err != nil {
   872  		return nil, err
   873  	}
   874  
   875  	st, _, err := openState(agentConfig, stateWorkerDialOpts)
   876  	if err != nil {
   877  		return nil, err
   878  	}
   879  
   880  	reportOpenedState(st)
   881  
   882  	return st, nil
   883  }
   884  
// startStateWorkers returns a worker running all the workers that
// require a *state.State connection.
//
// The machine for the agent's tag is looked up in st and workers are
// started according to the jobs that machine holds. An error is
// returned if the machine lookup fails, if the singular runner cannot
// be created, or if the machine holds a job type not handled here.
func (a *MachineAgent) startStateWorkers(st *state.State) (worker.Worker, error) {
	agentConfig := a.CurrentConfig()

	m, err := getMachine(st, agentConfig.Tag())
	if err != nil {
		return nil, errors.Annotate(err, "machine lookup")
	}

	// runner hosts the workers started below and is the worker that
	// is ultimately returned to the caller.
	runner := worker.NewRunner(
		cmdutil.PingerIsFatal(logger, st),
		cmdutil.MoreImportant,
		worker.RestartDelay,
	)
	// singularRunner is built on top of runner; the dblogpruner and
	// txnpruner workers below are started through it.
	// NOTE(review): presumably it only runs its workers on the mongo
	// master (see singularStateConn.IsMaster) - confirm.
	singularRunner, err := newSingularStateRunner(runner, st, m)
	if err != nil {
		return nil, errors.Trace(err)
	}

	for _, job := range m.Jobs() {
		switch job {
		case state.JobHostUnits:
			// Implemented elsewhere with workers that use the API.
		case state.JobManageModel:
			useMultipleCPUs()
			a.startWorkerAfterUpgrade(runner, "model worker manager", func() (worker.Worker, error) {
				w, err := modelworkermanager.New(modelworkermanager.Config{
					ControllerUUID: st.ControllerUUID(),
					Backend:        st,
					NewWorker:      a.startModelWorkers,
					ErrorDelay:     worker.RestartDelay,
				})
				if err != nil {
					return nil, errors.Annotate(err, "cannot start model worker manager")
				}
				return w, nil
			})
			a.startWorkerAfterUpgrade(runner, "peergrouper", func() (worker.Worker, error) {
				// The environ is only needed to find out whether the
				// provider supports spaces.
				env, err := stateenvirons.GetNewEnvironFunc(environs.New)(st)
				if err != nil {
					return nil, errors.Annotate(err, "getting environ from state")
				}
				supportsSpaces := environs.SupportsSpaces(env)
				w, err := peergrouperNew(st, supportsSpaces)
				if err != nil {
					return nil, errors.Annotate(err, "cannot start peergrouper worker")
				}
				return w, nil
			})
			a.startWorkerAfterUpgrade(runner, "restore", func() (worker.Worker, error) {
				w, err := a.newRestoreStateWatcherWorker(st)
				if err != nil {
					return nil, errors.Annotate(err, "cannot start backup-restorer worker")
				}
				return w, nil
			})
			a.startWorkerAfterUpgrade(runner, "mongoupgrade", func() (worker.Worker, error) {
				return newUpgradeMongoWorker(st, a.machineId, a.maybeStopMongo)
			})

			// certChangedChan is shared by multiple workers it's up
			// to the agent to close it rather than any one of the
			// workers.  It is possible that multiple cert changes
			// come in before the apiserver is up to receive them.
			// Specify a bigger buffer to prevent deadlock when
			// the apiserver isn't up yet.  Use a size of 10 since we
			// allow up to 7 controllers, and might also update the
			// addresses of the local machine (127.0.0.1, ::1, etc).
			//
			// TODO(cherylj/waigani) Remove this workaround when
			// certupdater and apiserver can properly manage dependencies
			// through the dependency engine.
			//
			// TODO(ericsnow) For now we simply do not close the channel.
			certChangedChan := make(chan params.StateServingInfo, 10)
			// Each time apiserver worker is restarted, we need a fresh copy of state due
			// to the fact that state holds lease managers which are killed and need to be reset.
			stateOpener := func() (*state.State, error) {
				logger.Debugf("opening state for apiserver worker")
				// This st shadows the outer st: the apiserver worker
				// gets its own connection.
				st, _, err := openState(agentConfig, stateWorkerDialOpts)
				return st, err
			}
			// Unlike the other workers in this case, the apiserver is
			// started directly on the runner rather than through
			// startWorkerAfterUpgrade.
			runner.StartWorker("apiserver", a.apiserverWorkerStarter(stateOpener, certChangedChan))
			// stateServingSetter stores the new serving info in the
			// agent config and then offers it to the apiserver over
			// certChangedChan, giving up on the send if done is closed.
			var stateServingSetter certupdater.StateServingInfoSetter = func(info params.StateServingInfo, done <-chan struct{}) error {
				return a.ChangeConfig(func(config agent.ConfigSetter) error {
					config.SetStateServingInfo(info)
					logger.Infof("update apiserver worker with new certificate")
					select {
					case certChangedChan <- info:
						return nil
					case <-done:
						return nil
					}
				})
			}
			a.startWorkerAfterUpgrade(runner, "certupdater", func() (worker.Worker, error) {
				return newCertificateUpdater(m, agentConfig, st, st, stateServingSetter), nil
			})

			a.startWorkerAfterUpgrade(singularRunner, "dblogpruner", func() (worker.Worker, error) {
				return dblogpruner.New(st, dblogpruner.NewLogPruneParams()), nil
			})

			a.startWorkerAfterUpgrade(singularRunner, "txnpruner", func() (worker.Worker, error) {
				return txnpruner.New(st, time.Hour*2, clock.WallClock), nil
			})
		default:
			return nil, errors.Errorf("unknown job type %q", job)
		}
	}
	return runner, nil
}
   998  
   999  // startModelWorkers starts the set of workers that run for every model
  1000  // in each controller.
  1001  func (a *MachineAgent) startModelWorkers(controllerUUID, modelUUID string) (worker.Worker, error) {
  1002  	modelAgent, err := model.WrapAgent(a, controllerUUID, modelUUID)
  1003  	if err != nil {
  1004  		return nil, errors.Trace(err)
  1005  	}
  1006  
  1007  	engine, err := dependency.NewEngine(dependency.EngineConfig{
  1008  		IsFatal:     model.IsFatal,
  1009  		WorstError:  model.WorstError,
  1010  		Filter:      model.IgnoreErrRemoved,
  1011  		ErrorDelay:  3 * time.Second,
  1012  		BounceDelay: 10 * time.Millisecond,
  1013  	})
  1014  	if err != nil {
  1015  		return nil, errors.Trace(err)
  1016  	}
  1017  
  1018  	manifolds := modelManifolds(model.ManifoldsConfig{
  1019  		Agent:                       modelAgent,
  1020  		AgentConfigChanged:          a.configChangedVal,
  1021  		Clock:                       clock.WallClock,
  1022  		RunFlagDuration:             time.Minute,
  1023  		CharmRevisionUpdateInterval: 24 * time.Hour,
  1024  		InstPollerAggregationDelay:  3 * time.Second,
  1025  		// TODO(perrito666) the status history pruning numbers need
  1026  		// to be adjusting, after collecting user data from large install
  1027  		// bases, to numbers allowing a rich and useful back history.
  1028  		StatusHistoryPrunerMaxHistoryTime: 336 * time.Hour, // 2 weeks
  1029  		StatusHistoryPrunerMaxHistoryMB:   5120,            // 5G
  1030  		StatusHistoryPrunerInterval:       5 * time.Minute,
  1031  		SpacesImportedGate:                a.discoverSpacesComplete,
  1032  		NewEnvironFunc:                    newEnvirons,
  1033  		NewMigrationMaster:                migrationmaster.NewWorker,
  1034  	})
  1035  	if err := dependency.Install(engine, manifolds); err != nil {
  1036  		if err := worker.Stop(engine); err != nil {
  1037  			logger.Errorf("while stopping engine with bad manifolds: %v", err)
  1038  		}
  1039  		return nil, errors.Trace(err)
  1040  	}
  1041  	return engine, nil
  1042  }
  1043  
// stateWorkerDialOpts is a mongo.DialOpts suitable
// for use by StateWorker to dial mongo. In this part of the file it is
// passed to openState by initState and by the apiserver state opener.
//
// This must be overridden in tests, as it assumes
// journaling is enabled.
//
// NOTE(review): the variable is declared without a value here; it is
// presumably assigned during package initialisation - confirm.
var stateWorkerDialOpts mongo.DialOpts
  1050  
  1051  func (a *MachineAgent) apiserverWorkerStarter(
  1052  	stateOpener func() (*state.State, error), certChanged chan params.StateServingInfo,
  1053  ) func() (worker.Worker, error) {
  1054  	return func() (worker.Worker, error) {
  1055  		st, err := stateOpener()
  1056  		if err != nil {
  1057  			return nil, errors.Trace(err)
  1058  		}
  1059  		return a.newAPIserverWorker(st, certChanged)
  1060  	}
  1061  }
  1062  
  1063  func (a *MachineAgent) newAPIserverWorker(st *state.State, certChanged chan params.StateServingInfo) (worker.Worker, error) {
  1064  	agentConfig := a.CurrentConfig()
  1065  	// If the configuration does not have the required information,
  1066  	// it is currently not a recoverable error, so we kill the whole
  1067  	// agent, potentially enabling human intervention to fix
  1068  	// the agent's configuration file.
  1069  	info, ok := agentConfig.StateServingInfo()
  1070  	if !ok {
  1071  		return nil, &cmdutil.FatalError{"StateServingInfo not available and we need it"}
  1072  	}
  1073  	cert := info.Cert
  1074  	key := info.PrivateKey
  1075  
  1076  	if len(cert) == 0 || len(key) == 0 {
  1077  		return nil, &cmdutil.FatalError{"configuration does not have controller cert/key"}
  1078  	}
  1079  	tag := agentConfig.Tag()
  1080  	dataDir := agentConfig.DataDir()
  1081  	logDir := agentConfig.LogDir()
  1082  
  1083  	endpoint := net.JoinHostPort("", strconv.Itoa(info.APIPort))
  1084  	listener, err := net.Listen("tcp", endpoint)
  1085  	if err != nil {
  1086  		return nil, err
  1087  	}
  1088  
  1089  	// TODO(katco): We should be doing something more serious than
  1090  	// logging audit errors. Failures in the auditing systems should
  1091  	// stop the api server until the problem can be corrected.
  1092  	auditErrorHandler := func(err error) {
  1093  		logger.Criticalf("%v", err)
  1094  	}
  1095  
  1096  	controllerConfig, err := st.ControllerConfig()
  1097  	if err != nil {
  1098  		return nil, errors.Annotate(err, "cannot fetch the controller config")
  1099  	}
  1100  
  1101  	server, err := apiserver.NewServer(st, listener, apiserver.ServerConfig{
  1102  		Clock:            clock.WallClock,
  1103  		Cert:             cert,
  1104  		Key:              key,
  1105  		Tag:              tag,
  1106  		DataDir:          dataDir,
  1107  		LogDir:           logDir,
  1108  		Validator:        a.limitLogins,
  1109  		CertChanged:      certChanged,
  1110  		AutocertURL:      controllerConfig.AutocertURL(),
  1111  		AutocertDNSName:  controllerConfig.AutocertDNSName(),
  1112  		AllowModelAccess: controllerConfig.AllowModelAccess(),
  1113  		NewObserver: newObserverFn(
  1114  			controllerConfig,
  1115  			clock.WallClock,
  1116  			jujuversion.Current,
  1117  			agentConfig.Model().Id(),
  1118  			newAuditEntrySink(st, logDir),
  1119  			auditErrorHandler,
  1120  		),
  1121  	})
  1122  	if err != nil {
  1123  		return nil, errors.Annotate(err, "cannot start api server worker")
  1124  	}
  1125  
  1126  	return server, nil
  1127  }
  1128  
  1129  func newAuditEntrySink(st *state.State, logDir string) audit.AuditEntrySinkFn {
  1130  	persistFn := st.PutAuditEntryFn()
  1131  	fileSinkFn := audit.NewLogFileSink(logDir)
  1132  	return func(entry audit.AuditEntry) error {
  1133  		// We don't care about auditing anything but user actions.
  1134  		if _, err := names.ParseUserTag(entry.OriginName); err != nil {
  1135  			return nil
  1136  		}
  1137  		// TODO(wallyworld) - Pinger requests should not originate as a user action.
  1138  		if strings.HasPrefix(entry.Operation, "Pinger:") {
  1139  			return nil
  1140  		}
  1141  		persistErr := persistFn(entry)
  1142  		sinkErr := fileSinkFn(entry)
  1143  		if persistErr == nil {
  1144  			return errors.Annotate(sinkErr, "cannot save audit record to file")
  1145  		}
  1146  		if sinkErr == nil {
  1147  			return errors.Annotate(persistErr, "cannot save audit record to database")
  1148  		}
  1149  		return errors.Annotate(persistErr, "cannot save audit record to file or database")
  1150  	}
  1151  }
  1152  
  1153  func newObserverFn(
  1154  	controllerConfig controller.Config,
  1155  	clock clock.Clock,
  1156  	jujuServerVersion version.Number,
  1157  	modelUUID string,
  1158  	persistAuditEntry audit.AuditEntrySinkFn,
  1159  	auditErrorHandler observer.ErrorHandler,
  1160  ) observer.ObserverFactory {
  1161  
  1162  	var observerFactories []observer.ObserverFactory
  1163  
  1164  	// Common logging of RPC requests
  1165  	observerFactories = append(observerFactories, func() observer.Observer {
  1166  		logger := loggo.GetLogger("juju.apiserver")
  1167  		ctx := observer.RequestObserverContext{
  1168  			Clock:  clock,
  1169  			Logger: logger,
  1170  		}
  1171  		return observer.NewRequestObserver(ctx)
  1172  	})
  1173  
  1174  	// Auditing observer
  1175  	// TODO(katco): Auditing needs feature tests (lp:1604551)
  1176  	if controllerConfig.AuditingEnabled() {
  1177  		observerFactories = append(observerFactories, func() observer.Observer {
  1178  			ctx := &observer.AuditContext{
  1179  				JujuServerVersion: jujuServerVersion,
  1180  				ModelUUID:         modelUUID,
  1181  			}
  1182  			return observer.NewAudit(ctx, persistAuditEntry, auditErrorHandler)
  1183  		})
  1184  	}
  1185  
  1186  	return observer.ObserverFactoryMultiplexer(observerFactories...)
  1187  
  1188  }
  1189  
  1190  // limitLogins is called by the API server for each login attempt.
  1191  // it returns an error if upgrades or restore are running.
  1192  func (a *MachineAgent) limitLogins(req params.LoginRequest) error {
  1193  	if err := a.limitLoginsDuringRestore(req); err != nil {
  1194  		return err
  1195  	}
  1196  	if err := a.limitLoginsDuringUpgrade(req); err != nil {
  1197  		return err
  1198  	}
  1199  	return a.limitLoginsDuringMongoUpgrade(req)
  1200  }
  1201  
  1202  func (a *MachineAgent) limitLoginsDuringMongoUpgrade(req params.LoginRequest) error {
  1203  	// If upgrade is running we will not be able to lock AgentConfigWriter
  1204  	// and it also means we are not upgrading mongo.
  1205  	if a.isUpgradeRunning() {
  1206  		return nil
  1207  	}
  1208  	cfg := a.AgentConfigWriter.CurrentConfig()
  1209  	ver := cfg.MongoVersion()
  1210  	if ver == mongo.MongoUpgrade {
  1211  		return errors.New("Upgrading Mongo")
  1212  	}
  1213  	return nil
  1214  }
  1215  
  1216  // limitLoginsDuringRestore will only allow logins for restore related purposes
  1217  // while the different steps of restore are running.
  1218  func (a *MachineAgent) limitLoginsDuringRestore(req params.LoginRequest) error {
  1219  	var err error
  1220  	switch {
  1221  	case a.IsRestoreRunning():
  1222  		err = apiserver.RestoreInProgressError
  1223  	case a.IsRestorePreparing():
  1224  		err = apiserver.AboutToRestoreError
  1225  	}
  1226  	if err != nil {
  1227  		authTag, parseErr := names.ParseTag(req.AuthTag)
  1228  		if parseErr != nil {
  1229  			return errors.Annotate(err, "could not parse auth tag")
  1230  		}
  1231  		switch authTag := authTag.(type) {
  1232  		case names.UserTag:
  1233  			// use a restricted API mode
  1234  			return err
  1235  		case names.MachineTag:
  1236  			if authTag == a.Tag() {
  1237  				// allow logins from the local machine
  1238  				return nil
  1239  			}
  1240  		}
  1241  		return errors.Errorf("login for %q blocked because restore is in progress", authTag)
  1242  	}
  1243  	return nil
  1244  }
  1245  
  1246  // limitLoginsDuringUpgrade is called by the API server for each login
  1247  // attempt. It returns an error if upgrades are in progress unless the
  1248  // login is for a user (i.e. a client) or the local machine.
  1249  func (a *MachineAgent) limitLoginsDuringUpgrade(req params.LoginRequest) error {
  1250  	if a.isUpgradeRunning() || a.isInitialUpgradeCheckPending() {
  1251  		authTag, err := names.ParseTag(req.AuthTag)
  1252  		if err != nil {
  1253  			return errors.Annotate(err, "could not parse auth tag")
  1254  		}
  1255  		switch authTag := authTag.(type) {
  1256  		case names.UserTag:
  1257  			// use a restricted API mode
  1258  			return params.UpgradeInProgressError
  1259  		case names.MachineTag:
  1260  			if authTag == a.Tag() {
  1261  				// allow logins from the local machine
  1262  				return nil
  1263  			}
  1264  		}
  1265  		return errors.Errorf("login for %q blocked because %s", authTag, params.CodeUpgradeInProgress)
  1266  	} else {
  1267  		return nil // allow all logins
  1268  	}
  1269  }
  1270  
// stateWorkerServingConfigErr is the error value for a state worker
// that was started without state serving info in the agent config.
// NOTE(review): it is not referenced in this part of the file - confirm
// where (and whether) it is still used.
var stateWorkerServingConfigErr = errors.New("state worker started with no state serving info")
  1272  
  1273  // ensureMongoServer ensures that mongo is installed and running,
  1274  // and ready for opening a state connection.
  1275  func (a *MachineAgent) ensureMongoServer(agentConfig agent.Config) (err error) {
  1276  	a.mongoInitMutex.Lock()
  1277  	defer a.mongoInitMutex.Unlock()
  1278  	if a.mongoInitialized {
  1279  		logger.Debugf("mongo is already initialized")
  1280  		return nil
  1281  	}
  1282  	defer func() {
  1283  		if err == nil {
  1284  			a.mongoInitialized = true
  1285  		}
  1286  	}()
  1287  
  1288  	mongoInstalled, err := mongo.IsServiceInstalled()
  1289  	if err != nil {
  1290  		return errors.Annotate(err, "error while checking if mongodb service is installed")
  1291  	}
  1292  
  1293  	if !mongoInstalled {
  1294  		// EnsureMongoServer installs/upgrades the init config as necessary.
  1295  		ensureServerParams, err := cmdutil.NewEnsureServerParams(agentConfig)
  1296  		if err != nil {
  1297  			return err
  1298  		}
  1299  		if err := cmdutil.EnsureMongoServer(ensureServerParams); err != nil {
  1300  			return err
  1301  		}
  1302  	}
  1303  	logger.Debugf("mongodb service is installed")
  1304  
  1305  	// Mongo is installed, record the version.
  1306  	err = a.ChangeConfig(func(config agent.ConfigSetter) error {
  1307  		config.SetMongoVersion(mongo.InstalledVersion())
  1308  		return nil
  1309  	})
  1310  	if err != nil {
  1311  		return errors.Annotate(err, "cannot set mongo version")
  1312  	}
  1313  	return nil
  1314  }
  1315  
  1316  func openState(agentConfig agent.Config, dialOpts mongo.DialOpts) (_ *state.State, _ *state.Machine, err error) {
  1317  	info, ok := agentConfig.MongoInfo()
  1318  	if !ok {
  1319  		return nil, nil, errors.Errorf("no state info available")
  1320  	}
  1321  	st, err := state.Open(agentConfig.Model(), agentConfig.Controller(), info, dialOpts,
  1322  		stateenvirons.GetNewPolicyFunc(
  1323  			stateenvirons.GetNewEnvironFunc(environs.New),
  1324  		),
  1325  	)
  1326  	if err != nil {
  1327  		return nil, nil, err
  1328  	}
  1329  	defer func() {
  1330  		if err != nil {
  1331  			st.Close()
  1332  		}
  1333  	}()
  1334  	m0, err := st.FindEntity(agentConfig.Tag())
  1335  	if err != nil {
  1336  		if errors.IsNotFound(err) {
  1337  			err = worker.ErrTerminateAgent
  1338  		}
  1339  		return nil, nil, err
  1340  	}
  1341  	m := m0.(*state.Machine)
  1342  	if m.Life() == state.Dead {
  1343  		return nil, nil, worker.ErrTerminateAgent
  1344  	}
  1345  	// Check the machine nonce as provisioned matches the agent.Conf value.
  1346  	if !m.CheckProvisioned(agentConfig.Nonce()) {
  1347  		// The agent is running on a different machine to the one it
  1348  		// should be according to state. It must stop immediately.
  1349  		logger.Errorf("running machine %v agent on inappropriate instance", m)
  1350  		return nil, nil, worker.ErrTerminateAgent
  1351  	}
  1352  	return st, m, nil
  1353  }
  1354  
  1355  func getMachine(st *state.State, tag names.Tag) (*state.Machine, error) {
  1356  	m0, err := st.FindEntity(tag)
  1357  	if err != nil {
  1358  		return nil, err
  1359  	}
  1360  	return m0.(*state.Machine), nil
  1361  }
  1362  
  1363  // startWorkerAfterUpgrade starts a worker to run the specified child worker
  1364  // but only after waiting for upgrades to complete.
  1365  func (a *MachineAgent) startWorkerAfterUpgrade(runner worker.Runner, name string, start func() (worker.Worker, error)) {
  1366  	runner.StartWorker(name, func() (worker.Worker, error) {
  1367  		return a.upgradeWaiterWorker(name, start), nil
  1368  	})
  1369  }
  1370  
  1371  // upgradeWaiterWorker runs the specified worker after upgrades have completed.
  1372  func (a *MachineAgent) upgradeWaiterWorker(name string, start func() (worker.Worker, error)) worker.Worker {
  1373  	return worker.NewSimpleWorker(func(stop <-chan struct{}) error {
  1374  		// Wait for the agent upgrade and upgrade steps to complete (or for us to be stopped).
  1375  		for _, ch := range []<-chan struct{}{
  1376  			a.upgradeComplete.Unlocked(),
  1377  			a.initialUpgradeCheckComplete.Unlocked(),
  1378  		} {
  1379  			select {
  1380  			case <-stop:
  1381  				return nil
  1382  			case <-ch:
  1383  			}
  1384  		}
  1385  		logger.Debugf("upgrades done, starting worker %q", name)
  1386  
  1387  		// Upgrades are done, start the worker.
  1388  		w, err := start()
  1389  		if err != nil {
  1390  			return err
  1391  		}
  1392  		// Wait for worker to finish or for us to be stopped.
  1393  		done := make(chan error, 1)
  1394  		go func() {
  1395  			done <- w.Wait()
  1396  		}()
  1397  		select {
  1398  		case err := <-done:
  1399  			return errors.Annotatef(err, "worker %q exited", name)
  1400  		case <-stop:
  1401  			logger.Debugf("stopping so killing worker %q", name)
  1402  			return worker.Stop(w)
  1403  		}
  1404  	})
  1405  }
  1406  
// WorkersStarted returns a channel that's closed once all top level workers
// have been started. This is provided for testing purposes.
//
// The channel is the agent's workersStarted field; it is returned
// receive-only so callers can wait on it but not close it.
func (a *MachineAgent) WorkersStarted() <-chan struct{} {
	return a.workersStarted
}
  1412  
// Tag returns the machine tag built from this agent's machine id.
func (a *MachineAgent) Tag() names.Tag {
	return names.NewMachineTag(a.machineId)
}
  1416  
  1417  func (a *MachineAgent) createJujudSymlinks(dataDir string) error {
  1418  	jujud := filepath.Join(tools.ToolsDir(dataDir, a.Tag().String()), jujunames.Jujud)
  1419  	for _, link := range []string{jujuRun, jujuDumpLogs} {
  1420  		err := a.createSymlink(jujud, link)
  1421  		if err != nil {
  1422  			return errors.Annotatef(err, "failed to create %s symlink", link)
  1423  		}
  1424  	}
  1425  	return nil
  1426  }
  1427  
  1428  func (a *MachineAgent) createSymlink(target, link string) error {
  1429  	fullLink := utils.EnsureBaseDir(a.rootDir, link)
  1430  
  1431  	currentTarget, err := symlink.Read(fullLink)
  1432  	if err != nil && !os.IsNotExist(err) {
  1433  		return err
  1434  	} else if err == nil {
  1435  		// Link already in place - check it.
  1436  		if currentTarget == target {
  1437  			// Link already points to the right place - nothing to do.
  1438  			return nil
  1439  		}
  1440  		// Link points to the wrong place - delete it.
  1441  		if err := os.Remove(fullLink); err != nil {
  1442  			return err
  1443  		}
  1444  	}
  1445  
  1446  	if err := os.MkdirAll(filepath.Dir(fullLink), os.FileMode(0755)); err != nil {
  1447  		return err
  1448  	}
  1449  	return symlink.New(target, fullLink)
  1450  }
  1451  
  1452  func (a *MachineAgent) removeJujudSymlinks() (errs []error) {
  1453  	for _, link := range []string{jujuRun, jujuDumpLogs} {
  1454  		err := os.Remove(utils.EnsureBaseDir(a.rootDir, link))
  1455  		if err != nil && !os.IsNotExist(err) {
  1456  			errs = append(errs, errors.Annotatef(err, "failed to remove %s symlink", link))
  1457  		}
  1458  	}
  1459  	return
  1460  }
  1461  
  1462  func (a *MachineAgent) uninstallAgent() error {
  1463  	// We should only uninstall if the uninstall file is present.
  1464  	if !agent.CanUninstall(a) {
  1465  		logger.Infof("ignoring uninstall request")
  1466  		return nil
  1467  	}
  1468  	logger.Infof("uninstalling agent")
  1469  
  1470  	agentConfig := a.CurrentConfig()
  1471  	var errs []error
  1472  	agentServiceName := agentConfig.Value(agent.AgentServiceName)
  1473  	if agentServiceName == "" {
  1474  		// For backwards compatibility, handle lack of AgentServiceName.
  1475  		agentServiceName = os.Getenv("UPSTART_JOB")
  1476  	}
  1477  
  1478  	if agentServiceName != "" {
  1479  		svc, err := service.DiscoverService(agentServiceName, common.Conf{})
  1480  		if err != nil {
  1481  			errs = append(errs, errors.Errorf("cannot remove service %q: %v", agentServiceName, err))
  1482  		} else if err := svc.Remove(); err != nil {
  1483  			errs = append(errs, errors.Errorf("cannot remove service %q: %v", agentServiceName, err))
  1484  		}
  1485  	}
  1486  
  1487  	errs = append(errs, a.removeJujudSymlinks()...)
  1488  
  1489  	// TODO(fwereade): surely this shouldn't be happening here? Once we're
  1490  	// at this point we should expect to be killed in short order; if this
  1491  	// work is remotely important we should be blocking machine death on
  1492  	// its completion.
  1493  	insideContainer := container.RunningInContainer()
  1494  	if insideContainer {
  1495  		// We're running inside a container, so loop devices may leak. Detach
  1496  		// any loop devices that are backed by files on this machine.
  1497  		if err := a.loopDeviceManager.DetachLoopDevices("/", agentConfig.DataDir()); err != nil {
  1498  			errs = append(errs, err)
  1499  		}
  1500  	}
  1501  
  1502  	if err := mongo.RemoveService(); err != nil {
  1503  		errs = append(errs, errors.Annotate(err, "cannot stop/remove mongo service"))
  1504  	}
  1505  	if err := os.RemoveAll(agentConfig.DataDir()); err != nil {
  1506  		errs = append(errs, err)
  1507  	}
  1508  	if len(errs) == 0 {
  1509  		return nil
  1510  	}
  1511  	return errors.Errorf("uninstall failed: %v", errs)
  1512  }
  1513  
// MongoSessioner is implemented by anything that can supply a mongo
// session. newSingularStateRunner accepts it in place of a full state
// connection.
type MongoSessioner interface {
	// MongoSession returns the mgo session to use.
	MongoSession() *mgo.Session
}
  1517  
  1518  func newSingularStateRunner(runner worker.Runner, st MongoSessioner, m *state.Machine) (worker.Runner, error) {
  1519  	singularStateConn := singularStateConn{st.MongoSession(), m}
  1520  	singularRunner, err := newSingularRunner(runner, singularStateConn)
  1521  	if err != nil {
  1522  		return nil, errors.Annotate(err, "cannot make singular State Runner")
  1523  	}
  1524  	return singularRunner, err
  1525  }
  1526  
// singularStateConn implements singular.Conn on
// top of a State connection.
type singularStateConn struct {
	// session is the mongo session used for the master check and for
	// pinging.
	session *mgo.Session
	// machine is the machine whose master status is checked.
	machine *state.Machine
}
  1533  
// IsMaster returns the result of mongo.IsMaster for the connection's
// session and machine.
func (c singularStateConn) IsMaster() (bool, error) {
	return mongo.IsMaster(c.session, c.machine)
}
  1537  
// Ping pings the underlying mongo session and returns any error from
// doing so.
func (c singularStateConn) Ping() error {
	return c.session.Ping()
}
  1541  
  1542  func metricAPI(st api.Connection) (metricsmanager.MetricsManagerClient, error) {
  1543  	client, err := metricsmanager.NewClient(st)
  1544  	if err != nil {
  1545  		return nil, errors.Trace(err)
  1546  	}
  1547  	return client, nil
  1548  }
  1549  
// newDeployContext gives the tests the opportunity to create a deployer.Context
// that can be used for testing so as to (1) avoid deploying units to the system
// running the tests and (2) get access to the *State used internally, so that
// tests can be run without waiting for the 5s watcher refresh time to which we would
// otherwise be restricted.
//
// It is a variable rather than a function so that it can be replaced;
// the default builds a simple deployer context from the agent config
// and the deployer API state.
var newDeployContext = func(st *apideployer.State, agentConfig agent.Config) deployer.Context {
	return deployer.NewSimpleContext(agentConfig, st)
}