github.com/wallyworld/juju@v0.0.0-20161013125918-6cf1bc9d917a/cmd/jujud/agent/machine.go (about) 1 // Copyright 2012, 2013 Canonical Ltd. 2 // Licensed under the AGPLv3, see LICENCE file for details. 3 4 package agent 5 6 import ( 7 "fmt" 8 "net" 9 "os" 10 "path/filepath" 11 "runtime" 12 "strconv" 13 "strings" 14 "sync" 15 "time" 16 17 "github.com/juju/cmd" 18 "github.com/juju/errors" 19 "github.com/juju/gnuflag" 20 "github.com/juju/juju/api" 21 apiagent "github.com/juju/juju/api/agent" 22 "github.com/juju/juju/api/base" 23 apimachiner "github.com/juju/juju/api/machiner" 24 "github.com/juju/juju/controller" 25 "github.com/juju/loggo" 26 "github.com/juju/replicaset" 27 "github.com/juju/utils" 28 "github.com/juju/utils/clock" 29 "github.com/juju/utils/featureflag" 30 "github.com/juju/utils/series" 31 "github.com/juju/utils/set" 32 "github.com/juju/utils/symlink" 33 "github.com/juju/utils/voyeur" 34 "github.com/juju/version" 35 "gopkg.in/juju/charmrepo.v2-unstable" 36 "gopkg.in/juju/names.v2" 37 "gopkg.in/mgo.v2" 38 "gopkg.in/natefinch/lumberjack.v2" 39 "gopkg.in/tomb.v1" 40 41 "github.com/juju/juju/agent" 42 "github.com/juju/juju/agent/tools" 43 apideployer "github.com/juju/juju/api/deployer" 44 "github.com/juju/juju/api/metricsmanager" 45 apiprovisioner "github.com/juju/juju/api/provisioner" 46 "github.com/juju/juju/apiserver" 47 "github.com/juju/juju/apiserver/observer" 48 "github.com/juju/juju/apiserver/params" 49 "github.com/juju/juju/audit" 50 "github.com/juju/juju/cert" 51 "github.com/juju/juju/cmd/jujud/agent/machine" 52 "github.com/juju/juju/cmd/jujud/agent/model" 53 "github.com/juju/juju/cmd/jujud/reboot" 54 cmdutil "github.com/juju/juju/cmd/jujud/util" 55 "github.com/juju/juju/container" 56 "github.com/juju/juju/container/kvm" 57 "github.com/juju/juju/environs" 58 "github.com/juju/juju/environs/simplestreams" 59 "github.com/juju/juju/instance" 60 jujunames "github.com/juju/juju/juju/names" 61 "github.com/juju/juju/juju/paths" 62 "github.com/juju/juju/mongo" 63 "github.com/juju/juju/service" 64 "github.com/juju/juju/service/common" 65 "github.com/juju/juju/state" 66 "github.com/juju/juju/state/multiwatcher" 67 "github.com/juju/juju/state/stateenvirons" 68 "github.com/juju/juju/storage/looputil" 69 "github.com/juju/juju/upgrades" 70 jujuversion "github.com/juju/juju/version" 71 "github.com/juju/juju/watcher" 72 "github.com/juju/juju/worker" 73 "github.com/juju/juju/worker/apicaller" 74 "github.com/juju/juju/worker/certupdater" 75 "github.com/juju/juju/worker/conv2state" 76 "github.com/juju/juju/worker/dblogpruner" 77 "github.com/juju/juju/worker/dependency" 78 "github.com/juju/juju/worker/deployer" 79 "github.com/juju/juju/worker/gate" 80 "github.com/juju/juju/worker/imagemetadataworker" 81 "github.com/juju/juju/worker/introspection" 82 "github.com/juju/juju/worker/logsender" 83 "github.com/juju/juju/worker/migrationmaster" 84 "github.com/juju/juju/worker/modelworkermanager" 85 "github.com/juju/juju/worker/mongoupgrader" 86 "github.com/juju/juju/worker/peergrouper" 87 "github.com/juju/juju/worker/provisioner" 88 "github.com/juju/juju/worker/singular" 89 "github.com/juju/juju/worker/txnpruner" 90 "github.com/juju/juju/worker/upgradesteps" 91 ) 92 93 var ( 94 logger = loggo.GetLogger("juju.cmd.jujud") 95 jujuRun = paths.MustSucceed(paths.JujuRun(series.HostSeries())) 96 jujuDumpLogs = paths.MustSucceed(paths.JujuDumpLogs(series.HostSeries())) 97 98 // The following are defined as variables to allow the tests to 99 // intercept calls to the functions. In every case, they should 100 // be expressed as explicit dependencies, but nobody has yet had 101 // the intestinal fortitude to untangle this package. Be that 102 // person! Juju Needs You. 103 useMultipleCPUs = utils.UseMultipleCPUs 104 newSingularRunner = singular.New 105 peergrouperNew = peergrouper.New 106 newCertificateUpdater = certupdater.NewCertificateUpdater 107 newMetadataUpdater = imagemetadataworker.NewWorker 108 newUpgradeMongoWorker = mongoupgrader.New 109 reportOpenedState = func(*state.State) {} 110 111 modelManifolds = model.Manifolds 112 machineManifolds = machine.Manifolds 113 ) 114 115 // Variable to override in tests, default is true 116 var ProductionMongoWriteConcern = true 117 118 func init() { 119 stateWorkerDialOpts = mongo.DefaultDialOpts() 120 stateWorkerDialOpts.PostDial = func(session *mgo.Session) error { 121 safe := mgo.Safe{} 122 if ProductionMongoWriteConcern { 123 safe.J = true 124 _, err := replicaset.CurrentConfig(session) 125 if err == nil { 126 // set mongo to write-majority (writes only returned after 127 // replicated to a majority of replica-set members). 128 safe.WMode = "majority" 129 } 130 } 131 session.SetSafe(&safe) 132 return nil 133 } 134 } 135 136 // AgentInitializer handles initializing a type for use as a Jujud 137 // agent. 138 type AgentInitializer interface { 139 AddFlags(*gnuflag.FlagSet) 140 CheckArgs([]string) error 141 } 142 143 // AgentConfigWriter encapsulates disk I/O operations with the agent 144 // config. 145 type AgentConfigWriter interface { 146 // ReadConfig reads the config for the given tag from disk. 147 ReadConfig(tag string) error 148 // ChangeConfig executes the given agent.ConfigMutator in a 149 // thread-safe context. 150 ChangeConfig(agent.ConfigMutator) error 151 // CurrentConfig returns a copy of the in-memory agent config. 152 CurrentConfig() agent.Config 153 } 154 155 // NewMachineAgentCmd creates a Command which handles parsing 156 // command-line arguments and instantiating and running a 157 // MachineAgent. 158 func NewMachineAgentCmd( 159 ctx *cmd.Context, 160 machineAgentFactory func(string) *MachineAgent, 161 agentInitializer AgentInitializer, 162 configFetcher AgentConfigWriter, 163 ) cmd.Command { 164 return &machineAgentCmd{ 165 ctx: ctx, 166 machineAgentFactory: machineAgentFactory, 167 agentInitializer: agentInitializer, 168 currentConfig: configFetcher, 169 } 170 } 171 172 type machineAgentCmd struct { 173 cmd.CommandBase 174 175 // This group of arguments is required. 176 agentInitializer AgentInitializer 177 currentConfig AgentConfigWriter 178 machineAgentFactory func(string) *MachineAgent 179 ctx *cmd.Context 180 181 // This group is for debugging purposes. 182 logToStdErr bool 183 184 // The following are set via command-line flags. 185 machineId string 186 } 187 188 // Init is called by the cmd system to initialize the structure for 189 // running. 190 func (a *machineAgentCmd) Init(args []string) error { 191 192 if !names.IsValidMachine(a.machineId) { 193 return errors.Errorf("--machine-id option must be set, and expects a non-negative integer") 194 } 195 if err := a.agentInitializer.CheckArgs(args); err != nil { 196 return err 197 } 198 199 // Due to changes in the logging, and needing to care about old 200 // models that have been upgraded, we need to explicitly remove the 201 // file writer if one has been added, otherwise we will get duplicate 202 // lines of all logging in the log file. 203 loggo.RemoveWriter("logfile") 204 205 if a.logToStdErr { 206 return nil 207 } 208 209 err := a.currentConfig.ReadConfig(names.NewMachineTag(a.machineId).String()) 210 if err != nil { 211 return errors.Annotate(err, "cannot read agent configuration") 212 } 213 214 // the context's stderr is set as the loggo writer in github.com/juju/cmd/logging.go 215 a.ctx.Stderr = &lumberjack.Logger{ 216 Filename: agent.LogFilename(a.currentConfig.CurrentConfig()), 217 MaxSize: 300, // megabytes 218 MaxBackups: 2, 219 } 220 221 return nil 222 } 223 224 // Run instantiates a MachineAgent and runs it. 225 func (a *machineAgentCmd) Run(c *cmd.Context) error { 226 machineAgent := a.machineAgentFactory(a.machineId) 227 return machineAgent.Run(c) 228 } 229 230 // SetFlags adds the requisite flags to run this command. 231 func (a *machineAgentCmd) SetFlags(f *gnuflag.FlagSet) { 232 a.agentInitializer.AddFlags(f) 233 f.StringVar(&a.machineId, "machine-id", "", "id of the machine to run") 234 } 235 236 // Info returns usage information for the command. 237 func (a *machineAgentCmd) Info() *cmd.Info { 238 return &cmd.Info{ 239 Name: "machine", 240 Purpose: "run a juju machine agent", 241 } 242 } 243 244 // MachineAgentFactoryFn returns a function which instantiates a 245 // MachineAgent given a machineId. 246 func MachineAgentFactoryFn( 247 agentConfWriter AgentConfigWriter, 248 bufferedLogs logsender.LogRecordCh, 249 rootDir string, 250 ) func(string) *MachineAgent { 251 return func(machineId string) *MachineAgent { 252 return NewMachineAgent( 253 machineId, 254 agentConfWriter, 255 bufferedLogs, 256 worker.NewRunner(cmdutil.IsFatal, cmdutil.MoreImportant, worker.RestartDelay), 257 looputil.NewLoopDeviceManager(), 258 rootDir, 259 ) 260 } 261 } 262 263 // NewMachineAgent instantiates a new MachineAgent. 264 func NewMachineAgent( 265 machineId string, 266 agentConfWriter AgentConfigWriter, 267 bufferedLogs logsender.LogRecordCh, 268 runner worker.Runner, 269 loopDeviceManager looputil.LoopDeviceManager, 270 rootDir string, 271 ) *MachineAgent { 272 return &MachineAgent{ 273 machineId: machineId, 274 AgentConfigWriter: agentConfWriter, 275 configChangedVal: voyeur.NewValue(true), 276 bufferedLogs: bufferedLogs, 277 workersStarted: make(chan struct{}), 278 runner: runner, 279 rootDir: rootDir, 280 initialUpgradeCheckComplete: gate.NewLock(), 281 loopDeviceManager: loopDeviceManager, 282 } 283 } 284 285 // MachineAgent is responsible for tying together all functionality 286 // needed to orchestrate a Jujud instance which controls a machine. 287 type MachineAgent struct { 288 AgentConfigWriter 289 290 tomb tomb.Tomb 291 machineId string 292 runner worker.Runner 293 rootDir string 294 bufferedLogs logsender.LogRecordCh 295 configChangedVal *voyeur.Value 296 upgradeComplete gate.Lock 297 workersStarted chan struct{} 298 299 // XXX(fwereade): these smell strongly of goroutine-unsafeness. 300 restoreMode bool 301 restoring bool 302 303 // Used to signal that the upgrade worker will not 304 // reboot the agent on startup because there are no 305 // longer any immediately pending agent upgrades. 306 initialUpgradeCheckComplete gate.Lock 307 308 discoverSpacesComplete gate.Lock 309 310 mongoInitMutex sync.Mutex 311 mongoInitialized bool 312 313 loopDeviceManager looputil.LoopDeviceManager 314 } 315 316 // IsRestorePreparing returns bool representing if we are in restore mode 317 // but not running restore. 318 func (a *MachineAgent) IsRestorePreparing() bool { 319 return a.restoreMode && !a.restoring 320 } 321 322 // IsRestoreRunning returns bool representing if we are in restore mode 323 // and running the actual restore process. 324 func (a *MachineAgent) IsRestoreRunning() bool { 325 return a.restoring 326 } 327 328 func (a *MachineAgent) isUpgradeRunning() bool { 329 return !a.upgradeComplete.IsUnlocked() 330 } 331 332 func (a *MachineAgent) isInitialUpgradeCheckPending() bool { 333 return !a.initialUpgradeCheckComplete.IsUnlocked() 334 } 335 336 // Wait waits for the machine agent to finish. 337 func (a *MachineAgent) Wait() error { 338 return a.tomb.Wait() 339 } 340 341 // Stop stops the machine agent. 342 func (a *MachineAgent) Stop() error { 343 a.runner.Kill() 344 return a.tomb.Wait() 345 } 346 347 // upgradeCertificateDNSNames ensure that the controller certificate 348 // recorded in the agent config and also mongo server.pem contains the 349 // DNSNames entries required by Juju. 350 func upgradeCertificateDNSNames(config agent.ConfigSetter) error { 351 si, ok := config.StateServingInfo() 352 if !ok || si.CAPrivateKey == "" { 353 // No certificate information exists yet, nothing to do. 354 return nil 355 } 356 357 // Validate the current certificate and private key pair, and then 358 // extract the current DNS names from the certificate. If the 359 // certificate validation fails, or it does not contain the DNS 360 // names we require, we will generate a new one. 361 var dnsNames set.Strings 362 serverCert, _, err := cert.ParseCertAndKey(si.Cert, si.PrivateKey) 363 if err != nil { 364 // The certificate is invalid, so create a new one. 365 logger.Infof("parsing certificate/key failed, will generate a new one: %v", err) 366 dnsNames = set.NewStrings() 367 } else { 368 dnsNames = set.NewStrings(serverCert.DNSNames...) 369 } 370 371 update := false 372 requiredDNSNames := []string{"local", "juju-apiserver", "juju-mongodb"} 373 for _, dnsName := range requiredDNSNames { 374 if dnsNames.Contains(dnsName) { 375 continue 376 } 377 dnsNames.Add(dnsName) 378 update = true 379 } 380 if !update { 381 return nil 382 } 383 384 // Write a new certificate to the mongo pem and agent config files. 385 si.Cert, si.PrivateKey, err = cert.NewDefaultServer(config.CACert(), si.CAPrivateKey, dnsNames.Values()) 386 if err != nil { 387 return err 388 } 389 if err := mongo.UpdateSSLKey(config.DataDir(), si.Cert, si.PrivateKey); err != nil { 390 return err 391 } 392 config.SetStateServingInfo(si) 393 return nil 394 } 395 396 // Run runs a machine agent. 397 func (a *MachineAgent) Run(*cmd.Context) error { 398 399 defer a.tomb.Done() 400 if err := a.ReadConfig(a.Tag().String()); err != nil { 401 return errors.Errorf("cannot read agent configuration: %v", err) 402 } 403 404 logger.Infof("machine agent %v start (%s [%s])", a.Tag(), jujuversion.Current, runtime.Compiler) 405 if flags := featureflag.String(); flags != "" { 406 logger.Warningf("developer feature flags enabled: %s", flags) 407 } 408 if err := introspection.WriteProfileFunctions(); err != nil { 409 // This isn't fatal, just annoying. 410 logger.Errorf("failed to write profile funcs: %v", err) 411 } 412 413 // Before doing anything else, we need to make sure the certificate generated for 414 // use by mongo to validate controller connections is correct. This needs to be done 415 // before any possible restart of the mongo service. 416 // See bug http://pad.lv/1434680 417 if err := a.AgentConfigWriter.ChangeConfig(upgradeCertificateDNSNames); err != nil { 418 return errors.Annotate(err, "error upgrading server certificate") 419 } 420 421 if upgradeComplete, err := upgradesteps.NewLock(a); err != nil { 422 return errors.Annotate(err, "error during creating upgrade completion channel") 423 } else { 424 a.upgradeComplete = upgradeComplete 425 } 426 427 agentConfig := a.CurrentConfig() 428 createEngine := a.makeEngineCreator(agentConfig.UpgradedToVersion()) 429 charmrepo.CacheDir = filepath.Join(agentConfig.DataDir(), "charmcache") 430 if err := a.createJujudSymlinks(agentConfig.DataDir()); err != nil { 431 return err 432 } 433 a.runner.StartWorker("engine", createEngine) 434 435 // At this point, all workers will have been configured to start 436 close(a.workersStarted) 437 err := a.runner.Wait() 438 switch errors.Cause(err) { 439 case worker.ErrTerminateAgent: 440 err = a.uninstallAgent() 441 case worker.ErrRebootMachine: 442 logger.Infof("Caught reboot error") 443 err = a.executeRebootOrShutdown(params.ShouldReboot) 444 case worker.ErrShutdownMachine: 445 logger.Infof("Caught shutdown error") 446 err = a.executeRebootOrShutdown(params.ShouldShutdown) 447 } 448 err = cmdutil.AgentDone(logger, err) 449 a.tomb.Kill(err) 450 return err 451 } 452 453 func (a *MachineAgent) makeEngineCreator(previousAgentVersion version.Number) func() (worker.Worker, error) { 454 return func() (worker.Worker, error) { 455 config := dependency.EngineConfig{ 456 IsFatal: cmdutil.IsFatal, 457 WorstError: cmdutil.MoreImportantError, 458 ErrorDelay: 3 * time.Second, 459 BounceDelay: 10 * time.Millisecond, 460 } 461 engine, err := dependency.NewEngine(config) 462 if err != nil { 463 return nil, err 464 } 465 manifolds := machineManifolds(machine.ManifoldsConfig{ 466 PreviousAgentVersion: previousAgentVersion, 467 Agent: agent.APIHostPortsSetter{Agent: a}, 468 RootDir: a.rootDir, 469 AgentConfigChanged: a.configChangedVal, 470 UpgradeStepsLock: a.upgradeComplete, 471 UpgradeCheckLock: a.initialUpgradeCheckComplete, 472 OpenState: a.initState, 473 OpenStateForUpgrade: a.openStateForUpgrade, 474 StartStateWorkers: a.startStateWorkers, 475 StartAPIWorkers: a.startAPIWorkers, 476 PreUpgradeSteps: upgrades.PreUpgradeSteps, 477 LogSource: a.bufferedLogs, 478 NewDeployContext: newDeployContext, 479 Clock: clock.WallClock, 480 ValidateMigration: a.validateMigration, 481 }) 482 if err := dependency.Install(engine, manifolds); err != nil { 483 if err := worker.Stop(engine); err != nil { 484 logger.Errorf("while stopping engine with bad manifolds: %v", err) 485 } 486 return nil, err 487 } 488 if err := startIntrospection(introspectionConfig{ 489 Agent: a, 490 Engine: engine, 491 WorkerFunc: introspection.NewWorker, 492 }); err != nil { 493 // If the introspection worker failed to start, we just log error 494 // but continue. It is very unlikely to happen in the real world 495 // as the only issue is connecting to the abstract domain socket 496 // and the agent is controlled by by the OS to only have one. 497 logger.Errorf("failed to start introspection worker: %v", err) 498 } 499 return engine, nil 500 } 501 } 502 503 func (a *MachineAgent) executeRebootOrShutdown(action params.RebootAction) error { 504 // At this stage, all API connections would have been closed 505 // We need to reopen the API to clear the reboot flag after 506 // scheduling the reboot. It may be cleaner to do this in the reboot 507 // worker, before returning the ErrRebootMachine. 508 conn, err := apicaller.OnlyConnect(a, api.Open) 509 if err != nil { 510 logger.Infof("Reboot: Error connecting to state") 511 return errors.Trace(err) 512 } 513 514 // block until all units/containers are ready, and reboot/shutdown 515 finalize, err := reboot.NewRebootWaiter(conn, a.CurrentConfig()) 516 if err != nil { 517 return errors.Trace(err) 518 } 519 520 logger.Infof("Reboot: Executing reboot") 521 err = finalize.ExecuteReboot(action) 522 if err != nil { 523 logger.Infof("Reboot: Error executing reboot: %v", err) 524 return errors.Trace(err) 525 } 526 // On windows, the shutdown command is asynchronous. We return ErrRebootMachine 527 // so the agent will simply exit without error pending reboot/shutdown. 528 return worker.ErrRebootMachine 529 } 530 531 func (a *MachineAgent) ChangeConfig(mutate agent.ConfigMutator) error { 532 err := a.AgentConfigWriter.ChangeConfig(mutate) 533 a.configChangedVal.Set(true) 534 return errors.Trace(err) 535 } 536 537 func (a *MachineAgent) maybeStopMongo(ver mongo.Version, isMaster bool) error { 538 if !a.mongoInitialized { 539 return nil 540 } 541 542 conf := a.AgentConfigWriter.CurrentConfig() 543 v := conf.MongoVersion() 544 545 logger.Errorf("Got version change %v", ver) 546 // TODO(perrito666) replace with "read-only" mode for environment when 547 // it is available. 548 if ver.NewerThan(v) > 0 { 549 err := a.AgentConfigWriter.ChangeConfig(func(config agent.ConfigSetter) error { 550 config.SetMongoVersion(mongo.MongoUpgrade) 551 return nil 552 }) 553 if err != nil { 554 return err 555 } 556 557 } 558 return nil 559 560 } 561 562 // PrepareRestore will flag the agent to allow only a limited set 563 // of commands defined in 564 // "github.com/juju/juju/apiserver".allowedMethodsAboutToRestore 565 // the most noteworthy is: 566 // Backups.Restore: this will ensure that we can do all the file movements 567 // required for restore and no one will do changes while we do that. 568 // it will return error if the machine is already in this state. 569 func (a *MachineAgent) PrepareRestore() error { 570 if a.restoreMode { 571 return errors.Errorf("already in restore mode") 572 } 573 a.restoreMode = true 574 return nil 575 } 576 577 // BeginRestore will flag the agent to disallow all commands since 578 // restore should be running and therefore making changes that 579 // would override anything done. 580 func (a *MachineAgent) BeginRestore() error { 581 switch { 582 case !a.restoreMode: 583 return errors.Errorf("not in restore mode, cannot begin restoration") 584 case a.restoring: 585 return errors.Errorf("already restoring") 586 } 587 a.restoring = true 588 return nil 589 } 590 591 // EndRestore will flag the agent to allow all commands 592 // This being invoked means that restore process failed 593 // since success restarts the agent. 594 func (a *MachineAgent) EndRestore() { 595 a.restoreMode = false 596 a.restoring = false 597 } 598 599 // newRestoreStateWatcherWorker will return a worker or err if there 600 // is a failure, the worker takes care of watching the state of 601 // restoreInfo doc and put the agent in the different restore modes. 602 func (a *MachineAgent) newRestoreStateWatcherWorker(st *state.State) (worker.Worker, error) { 603 rWorker := func(stopch <-chan struct{}) error { 604 return a.restoreStateWatcher(st, stopch) 605 } 606 return worker.NewSimpleWorker(rWorker), nil 607 } 608 609 // restoreChanged will be called whenever restoreInfo doc changes signaling a new 610 // step in the restore process. 611 func (a *MachineAgent) restoreChanged(st *state.State) error { 612 status, err := st.RestoreInfo().Status() 613 if err != nil { 614 return errors.Annotate(err, "cannot read restore state") 615 } 616 switch status { 617 case state.RestorePending: 618 a.PrepareRestore() 619 case state.RestoreInProgress: 620 a.BeginRestore() 621 case state.RestoreFailed: 622 a.EndRestore() 623 } 624 return nil 625 } 626 627 // restoreStateWatcher watches for restoreInfo looking for changes in the restore process. 628 func (a *MachineAgent) restoreStateWatcher(st *state.State, stopch <-chan struct{}) error { 629 restoreWatch := st.WatchRestoreInfoChanges() 630 defer func() { 631 restoreWatch.Kill() 632 restoreWatch.Wait() 633 }() 634 635 for { 636 select { 637 case <-restoreWatch.Changes(): 638 if err := a.restoreChanged(st); err != nil { 639 return err 640 } 641 case <-stopch: 642 return nil 643 } 644 } 645 } 646 647 var newEnvirons = environs.New 648 649 // startAPIWorkers is called to start workers which rely on the 650 // machine agent's API connection (via the apiworkers manifold). It 651 // returns a Runner with a number of workers attached to it. 652 // 653 // The workers started here need to be converted to run under the 654 // dependency engine. Once they have all been converted, this method - 655 // and the apiworkers manifold - can be removed. 656 func (a *MachineAgent) startAPIWorkers(apiConn api.Connection) (_ worker.Worker, outErr error) { 657 agentConfig := a.CurrentConfig() 658 659 entity, err := apiagent.NewState(apiConn).Entity(a.Tag()) 660 if err != nil { 661 return nil, errors.Trace(err) 662 } 663 664 var isModelManager bool 665 for _, job := range entity.Jobs() { 666 switch job { 667 case multiwatcher.JobManageModel: 668 isModelManager = true 669 default: 670 // TODO(dimitern): Once all workers moved over to using 671 // the API, report "unknown job type" here. 672 } 673 } 674 675 runner := worker.NewRunner( 676 cmdutil.ConnectionIsFatal(logger, apiConn), 677 cmdutil.MoreImportant, 678 worker.RestartDelay, 679 ) 680 defer func() { 681 // If startAPIWorkers exits early with an error, stop the 682 // runner so that any already started runners aren't leaked. 683 if outErr != nil { 684 worker.Stop(runner) 685 } 686 }() 687 688 // Perform the operations needed to set up hosting for containers. 689 if err := a.setupContainerSupport(runner, apiConn, agentConfig); err != nil { 690 cause := errors.Cause(err) 691 if params.IsCodeDead(cause) || cause == worker.ErrTerminateAgent { 692 return nil, worker.ErrTerminateAgent 693 } 694 return nil, errors.Errorf("setting up container support: %v", err) 695 } 696 697 if isModelManager { 698 699 // Published image metadata for some providers are in simple streams. 700 // Providers that do not depend on simple streams do not need this worker. 701 env, err := environs.GetEnviron(apiagent.NewState(apiConn), newEnvirons) 702 if err != nil { 703 return nil, errors.Annotate(err, "getting environ") 704 } 705 if _, ok := env.(simplestreams.HasRegion); ok { 706 // Start worker that stores published image metadata in state. 707 runner.StartWorker("imagemetadata", func() (worker.Worker, error) { 708 return newMetadataUpdater(apiConn.MetadataUpdater()), nil 709 }) 710 } 711 712 // We don't have instance info set and the network config for the 713 // bootstrap machine only, so update it now. All the other machines will 714 // have instance info including network config set at provisioning time. 715 if err := a.setControllerNetworkConfig(apiConn); err != nil { 716 return nil, errors.Annotate(err, "setting controller network config") 717 } 718 } else { 719 runner.StartWorker("stateconverter", func() (worker.Worker, error) { 720 // TODO(fwereade): this worker needs its own facade. 721 facade := apimachiner.NewState(apiConn) 722 handler := conv2state.New(facade, a) 723 w, err := watcher.NewNotifyWorker(watcher.NotifyConfig{ 724 Handler: handler, 725 }) 726 if err != nil { 727 return nil, errors.Annotate(err, "cannot start controller promoter worker") 728 } 729 return w, nil 730 }) 731 } 732 return runner, nil 733 } 734 735 func (a *MachineAgent) setControllerNetworkConfig(apiConn api.Connection) error { 736 machinerAPI := apimachiner.NewState(apiConn) 737 agentConfig := a.CurrentConfig() 738 739 tag := agentConfig.Tag().(names.MachineTag) 740 machine, err := machinerAPI.Machine(tag) 741 if errors.IsNotFound(err) || err == nil && machine.Life() == params.Dead { 742 return worker.ErrTerminateAgent 743 } 744 if err != nil { 745 return errors.Annotatef(err, "cannot load machine %s from state", tag) 746 } 747 748 if err := machine.SetProviderNetworkConfig(); err != nil { 749 return errors.Annotate(err, "cannot set controller provider network config") 750 } 751 return nil 752 } 753 754 // Restart restarts the agent's service. 755 func (a *MachineAgent) Restart() error { 756 name := a.CurrentConfig().Value(agent.AgentServiceName) 757 return service.Restart(name) 758 } 759 760 // openStateForUpgrade exists to be passed into the upgradesteps 761 // worker. The upgradesteps worker opens state independently of the 762 // state worker so that it isn't affected by the state worker's 763 // lifetime. It ensures the MongoDB server is configured and started, 764 // and then opens a state connection. 765 // 766 // TODO(mjs)- review the need for this once the dependency engine is 767 // in use. Why can't upgradesteps depend on the main state connection? 768 func (a *MachineAgent) openStateForUpgrade() (*state.State, error) { 769 agentConfig := a.CurrentConfig() 770 if err := a.ensureMongoServer(agentConfig); err != nil { 771 return nil, errors.Trace(err) 772 } 773 info, ok := agentConfig.MongoInfo() 774 if !ok { 775 return nil, errors.New("no state info available") 776 } 777 st, err := state.Open(agentConfig.Model(), agentConfig.Controller(), info, mongo.DefaultDialOpts(), 778 stateenvirons.GetNewPolicyFunc( 779 stateenvirons.GetNewEnvironFunc(environs.New), 780 ), 781 ) 782 if err != nil { 783 return nil, errors.Trace(err) 784 } 785 return st, nil 786 } 787 788 // validateMigration is called by the migrationminion to help check 789 // that the agent will be ok when connected to a new controller. 790 func (a *MachineAgent) validateMigration(apiCaller base.APICaller) error { 791 // TODO(mjs) - more extensive checks to come. 792 facade := apimachiner.NewState(apiCaller) 793 _, err := facade.Machine(names.NewMachineTag(a.machineId)) 794 return errors.Trace(err) 795 } 796 797 // setupContainerSupport determines what containers can be run on this machine and 798 // initialises suitable infrastructure to support such containers. 799 func (a *MachineAgent) setupContainerSupport(runner worker.Runner, st api.Connection, agentConfig agent.Config) error { 800 var supportedContainers []instance.ContainerType 801 supportsContainers := container.ContainersSupported() 802 if supportsContainers { 803 supportedContainers = append(supportedContainers, instance.LXD) 804 } 805 806 supportsKvm, err := kvm.IsKVMSupported() 807 if err != nil { 808 logger.Warningf("determining kvm support: %v\nno kvm containers possible", err) 809 } 810 if err == nil && supportsKvm { 811 supportedContainers = append(supportedContainers, instance.KVM) 812 } 813 814 return a.updateSupportedContainers(runner, st, supportedContainers, agentConfig) 815 } 816 817 // updateSupportedContainers records in state that a machine can run the specified containers. 818 // It starts a watcher and when a container of a given type is first added to the machine, 819 // the watcher is killed, the machine is set up to be able to start containers of the given type, 820 // and a suitable provisioner is started. 821 func (a *MachineAgent) updateSupportedContainers( 822 runner worker.Runner, 823 st api.Connection, 824 containers []instance.ContainerType, 825 agentConfig agent.Config, 826 ) error { 827 pr := apiprovisioner.NewState(st) 828 tag := agentConfig.Tag().(names.MachineTag) 829 machine, err := pr.Machine(tag) 830 if errors.IsNotFound(err) || err == nil && machine.Life() == params.Dead { 831 return worker.ErrTerminateAgent 832 } 833 if err != nil { 834 return errors.Annotatef(err, "cannot load machine %s from state", tag) 835 } 836 if len(containers) == 0 { 837 if err := machine.SupportsNoContainers(); err != nil { 838 return errors.Annotatef(err, "clearing supported containers for %s", tag) 839 } 840 return nil 841 } 842 if err := machine.SetSupportedContainers(containers...); err != nil { 843 return errors.Annotatef(err, "setting supported containers for %s", tag) 844 } 845 // Start the watcher to fire when a container is first requested on the machine. 846 watcherName := fmt.Sprintf("%s-container-watcher", machine.Id()) 847 params := provisioner.ContainerSetupParams{ 848 Runner: runner, 849 WorkerName: watcherName, 850 SupportedContainers: containers, 851 Machine: machine, 852 Provisioner: pr, 853 Config: agentConfig, 854 InitLockName: agent.MachineLockName, 855 } 856 handler := provisioner.NewContainerSetupHandler(params) 857 a.startWorkerAfterUpgrade(runner, watcherName, func() (worker.Worker, error) { 858 w, err := watcher.NewStringsWorker(watcher.StringsConfig{ 859 Handler: handler, 860 }) 861 if err != nil { 862 return nil, errors.Annotatef(err, "cannot start %s worker", watcherName) 863 } 864 return w, nil 865 }) 866 return nil 867 } 868 869 func (a *MachineAgent) initState(agentConfig agent.Config) (*state.State, error) { 870 // Start MongoDB server and dial. 871 if err := a.ensureMongoServer(agentConfig); err != nil { 872 return nil, err 873 } 874 875 st, _, err := openState(agentConfig, stateWorkerDialOpts) 876 if err != nil { 877 return nil, err 878 } 879 880 reportOpenedState(st) 881 882 return st, nil 883 } 884 885 // startStateWorkers returns a worker running all the workers that 886 // require a *state.State connection. 887 func (a *MachineAgent) startStateWorkers(st *state.State) (worker.Worker, error) { 888 agentConfig := a.CurrentConfig() 889 890 m, err := getMachine(st, agentConfig.Tag()) 891 if err != nil { 892 return nil, errors.Annotate(err, "machine lookup") 893 } 894 895 runner := worker.NewRunner( 896 cmdutil.PingerIsFatal(logger, st), 897 cmdutil.MoreImportant, 898 worker.RestartDelay, 899 ) 900 singularRunner, err := newSingularStateRunner(runner, st, m) 901 if err != nil { 902 return nil, errors.Trace(err) 903 } 904 905 for _, job := range m.Jobs() { 906 switch job { 907 case state.JobHostUnits: 908 // Implemented elsewhere with workers that use the API. 909 case state.JobManageModel: 910 useMultipleCPUs() 911 a.startWorkerAfterUpgrade(runner, "model worker manager", func() (worker.Worker, error) { 912 w, err := modelworkermanager.New(modelworkermanager.Config{ 913 ControllerUUID: st.ControllerUUID(), 914 Backend: st, 915 NewWorker: a.startModelWorkers, 916 ErrorDelay: worker.RestartDelay, 917 }) 918 if err != nil { 919 return nil, errors.Annotate(err, "cannot start model worker manager") 920 } 921 return w, nil 922 }) 923 a.startWorkerAfterUpgrade(runner, "peergrouper", func() (worker.Worker, error) { 924 env, err := stateenvirons.GetNewEnvironFunc(environs.New)(st) 925 if err != nil { 926 return nil, errors.Annotate(err, "getting environ from state") 927 } 928 supportsSpaces := environs.SupportsSpaces(env) 929 w, err := peergrouperNew(st, supportsSpaces) 930 if err != nil { 931 return nil, errors.Annotate(err, "cannot start peergrouper worker") 932 } 933 return w, nil 934 }) 935 a.startWorkerAfterUpgrade(runner, "restore", func() (worker.Worker, error) { 936 w, err := a.newRestoreStateWatcherWorker(st) 937 if err != nil { 938 return nil, errors.Annotate(err, "cannot start backup-restorer worker") 939 } 940 return w, nil 941 }) 942 a.startWorkerAfterUpgrade(runner, "mongoupgrade", func() (worker.Worker, error) { 943 return newUpgradeMongoWorker(st, a.machineId, a.maybeStopMongo) 944 }) 945 946 // certChangedChan is shared by multiple workers it's up 947 // to the agent to close it rather than any one of the 948 // workers. It is possible that multiple cert changes 949 // come in before the apiserver is up to receive them. 950 // Specify a bigger buffer to prevent deadlock when 951 // the apiserver isn't up yet. Use a size of 10 since we 952 // allow up to 7 controllers, and might also update the 953 // addresses of the local machine (127.0.0.1, ::1, etc). 954 // 955 // TODO(cherylj/waigani) Remove this workaround when 956 // certupdater and apiserver can properly manage dependencies 957 // through the dependency engine. 958 // 959 // TODO(ericsnow) For now we simply do not close the channel. 960 certChangedChan := make(chan params.StateServingInfo, 10) 961 // Each time apiserver worker is restarted, we need a fresh copy of state due 962 // to the fact that state holds lease managers which are killed and need to be reset. 963 stateOpener := func() (*state.State, error) { 964 logger.Debugf("opening state for apiserver worker") 965 st, _, err := openState(agentConfig, stateWorkerDialOpts) 966 return st, err 967 } 968 runner.StartWorker("apiserver", a.apiserverWorkerStarter(stateOpener, certChangedChan)) 969 var stateServingSetter certupdater.StateServingInfoSetter = func(info params.StateServingInfo, done <-chan struct{}) error { 970 return a.ChangeConfig(func(config agent.ConfigSetter) error { 971 config.SetStateServingInfo(info) 972 logger.Infof("update apiserver worker with new certificate") 973 select { 974 case certChangedChan <- info: 975 return nil 976 case <-done: 977 return nil 978 } 979 }) 980 } 981 a.startWorkerAfterUpgrade(runner, "certupdater", func() (worker.Worker, error) { 982 return newCertificateUpdater(m, agentConfig, st, st, stateServingSetter), nil 983 }) 984 985 a.startWorkerAfterUpgrade(singularRunner, "dblogpruner", func() (worker.Worker, error) { 986 return dblogpruner.New(st, dblogpruner.NewLogPruneParams()), nil 987 }) 988 989 a.startWorkerAfterUpgrade(singularRunner, "txnpruner", func() (worker.Worker, error) { 990 return txnpruner.New(st, time.Hour*2, clock.WallClock), nil 991 }) 992 default: 993 return nil, errors.Errorf("unknown job type %q", job) 994 } 995 } 996 return runner, nil 997 } 998 999 // startModelWorkers starts the set of workers that run for every model 1000 // in each controller. 1001 func (a *MachineAgent) startModelWorkers(controllerUUID, modelUUID string) (worker.Worker, error) { 1002 modelAgent, err := model.WrapAgent(a, controllerUUID, modelUUID) 1003 if err != nil { 1004 return nil, errors.Trace(err) 1005 } 1006 1007 engine, err := dependency.NewEngine(dependency.EngineConfig{ 1008 IsFatal: model.IsFatal, 1009 WorstError: model.WorstError, 1010 Filter: model.IgnoreErrRemoved, 1011 ErrorDelay: 3 * time.Second, 1012 BounceDelay: 10 * time.Millisecond, 1013 }) 1014 if err != nil { 1015 return nil, errors.Trace(err) 1016 } 1017 1018 manifolds := modelManifolds(model.ManifoldsConfig{ 1019 Agent: modelAgent, 1020 AgentConfigChanged: a.configChangedVal, 1021 Clock: clock.WallClock, 1022 RunFlagDuration: time.Minute, 1023 CharmRevisionUpdateInterval: 24 * time.Hour, 1024 InstPollerAggregationDelay: 3 * time.Second, 1025 // TODO(perrito666) the status history pruning numbers need 1026 // to be adjusting, after collecting user data from large install 1027 // bases, to numbers allowing a rich and useful back history. 1028 StatusHistoryPrunerMaxHistoryTime: 336 * time.Hour, // 2 weeks 1029 StatusHistoryPrunerMaxHistoryMB: 5120, // 5G 1030 StatusHistoryPrunerInterval: 5 * time.Minute, 1031 SpacesImportedGate: a.discoverSpacesComplete, 1032 NewEnvironFunc: newEnvirons, 1033 NewMigrationMaster: migrationmaster.NewWorker, 1034 }) 1035 if err := dependency.Install(engine, manifolds); err != nil { 1036 if err := worker.Stop(engine); err != nil { 1037 logger.Errorf("while stopping engine with bad manifolds: %v", err) 1038 } 1039 return nil, errors.Trace(err) 1040 } 1041 return engine, nil 1042 } 1043 1044 // stateWorkerDialOpts is a mongo.DialOpts suitable 1045 // for use by StateWorker to dial mongo. 1046 // 1047 // This must be overridden in tests, as it assumes 1048 // journaling is enabled. 1049 var stateWorkerDialOpts mongo.DialOpts 1050 1051 func (a *MachineAgent) apiserverWorkerStarter( 1052 stateOpener func() (*state.State, error), certChanged chan params.StateServingInfo, 1053 ) func() (worker.Worker, error) { 1054 return func() (worker.Worker, error) { 1055 st, err := stateOpener() 1056 if err != nil { 1057 return nil, errors.Trace(err) 1058 } 1059 return a.newAPIserverWorker(st, certChanged) 1060 } 1061 } 1062 1063 func (a *MachineAgent) newAPIserverWorker(st *state.State, certChanged chan params.StateServingInfo) (worker.Worker, error) { 1064 agentConfig := a.CurrentConfig() 1065 // If the configuration does not have the required information, 1066 // it is currently not a recoverable error, so we kill the whole 1067 // agent, potentially enabling human intervention to fix 1068 // the agent's configuration file. 1069 info, ok := agentConfig.StateServingInfo() 1070 if !ok { 1071 return nil, &cmdutil.FatalError{"StateServingInfo not available and we need it"} 1072 } 1073 cert := info.Cert 1074 key := info.PrivateKey 1075 1076 if len(cert) == 0 || len(key) == 0 { 1077 return nil, &cmdutil.FatalError{"configuration does not have controller cert/key"} 1078 } 1079 tag := agentConfig.Tag() 1080 dataDir := agentConfig.DataDir() 1081 logDir := agentConfig.LogDir() 1082 1083 endpoint := net.JoinHostPort("", strconv.Itoa(info.APIPort)) 1084 listener, err := net.Listen("tcp", endpoint) 1085 if err != nil { 1086 return nil, err 1087 } 1088 1089 // TODO(katco): We should be doing something more serious than 1090 // logging audit errors. Failures in the auditing systems should 1091 // stop the api server until the problem can be corrected. 1092 auditErrorHandler := func(err error) { 1093 logger.Criticalf("%v", err) 1094 } 1095 1096 controllerConfig, err := st.ControllerConfig() 1097 if err != nil { 1098 return nil, errors.Annotate(err, "cannot fetch the controller config") 1099 } 1100 1101 server, err := apiserver.NewServer(st, listener, apiserver.ServerConfig{ 1102 Clock: clock.WallClock, 1103 Cert: cert, 1104 Key: key, 1105 Tag: tag, 1106 DataDir: dataDir, 1107 LogDir: logDir, 1108 Validator: a.limitLogins, 1109 CertChanged: certChanged, 1110 AutocertURL: controllerConfig.AutocertURL(), 1111 AutocertDNSName: controllerConfig.AutocertDNSName(), 1112 AllowModelAccess: controllerConfig.AllowModelAccess(), 1113 NewObserver: newObserverFn( 1114 controllerConfig, 1115 clock.WallClock, 1116 jujuversion.Current, 1117 agentConfig.Model().Id(), 1118 newAuditEntrySink(st, logDir), 1119 auditErrorHandler, 1120 ), 1121 }) 1122 if err != nil { 1123 return nil, errors.Annotate(err, "cannot start api server worker") 1124 } 1125 1126 return server, nil 1127 } 1128 1129 func newAuditEntrySink(st *state.State, logDir string) audit.AuditEntrySinkFn { 1130 persistFn := st.PutAuditEntryFn() 1131 fileSinkFn := audit.NewLogFileSink(logDir) 1132 return func(entry audit.AuditEntry) error { 1133 // We don't care about auditing anything but user actions. 1134 if _, err := names.ParseUserTag(entry.OriginName); err != nil { 1135 return nil 1136 } 1137 // TODO(wallyworld) - Pinger requests should not originate as a user action. 1138 if strings.HasPrefix(entry.Operation, "Pinger:") { 1139 return nil 1140 } 1141 persistErr := persistFn(entry) 1142 sinkErr := fileSinkFn(entry) 1143 if persistErr == nil { 1144 return errors.Annotate(sinkErr, "cannot save audit record to file") 1145 } 1146 if sinkErr == nil { 1147 return errors.Annotate(persistErr, "cannot save audit record to database") 1148 } 1149 return errors.Annotate(persistErr, "cannot save audit record to file or database") 1150 } 1151 } 1152 1153 func newObserverFn( 1154 controllerConfig controller.Config, 1155 clock clock.Clock, 1156 jujuServerVersion version.Number, 1157 modelUUID string, 1158 persistAuditEntry audit.AuditEntrySinkFn, 1159 auditErrorHandler observer.ErrorHandler, 1160 ) observer.ObserverFactory { 1161 1162 var observerFactories []observer.ObserverFactory 1163 1164 // Common logging of RPC requests 1165 observerFactories = append(observerFactories, func() observer.Observer { 1166 logger := loggo.GetLogger("juju.apiserver") 1167 ctx := observer.RequestObserverContext{ 1168 Clock: clock, 1169 Logger: logger, 1170 } 1171 return observer.NewRequestObserver(ctx) 1172 }) 1173 1174 // Auditing observer 1175 // TODO(katco): Auditing needs feature tests (lp:1604551) 1176 if controllerConfig.AuditingEnabled() { 1177 observerFactories = append(observerFactories, func() observer.Observer { 1178 ctx := &observer.AuditContext{ 1179 JujuServerVersion: jujuServerVersion, 1180 ModelUUID: modelUUID, 1181 } 1182 return observer.NewAudit(ctx, persistAuditEntry, auditErrorHandler) 1183 }) 1184 } 1185 1186 return observer.ObserverFactoryMultiplexer(observerFactories...) 1187 1188 } 1189 1190 // limitLogins is called by the API server for each login attempt. 1191 // it returns an error if upgrades or restore are running. 1192 func (a *MachineAgent) limitLogins(req params.LoginRequest) error { 1193 if err := a.limitLoginsDuringRestore(req); err != nil { 1194 return err 1195 } 1196 if err := a.limitLoginsDuringUpgrade(req); err != nil { 1197 return err 1198 } 1199 return a.limitLoginsDuringMongoUpgrade(req) 1200 } 1201 1202 func (a *MachineAgent) limitLoginsDuringMongoUpgrade(req params.LoginRequest) error { 1203 // If upgrade is running we will not be able to lock AgentConfigWriter 1204 // and it also means we are not upgrading mongo. 1205 if a.isUpgradeRunning() { 1206 return nil 1207 } 1208 cfg := a.AgentConfigWriter.CurrentConfig() 1209 ver := cfg.MongoVersion() 1210 if ver == mongo.MongoUpgrade { 1211 return errors.New("Upgrading Mongo") 1212 } 1213 return nil 1214 } 1215 1216 // limitLoginsDuringRestore will only allow logins for restore related purposes 1217 // while the different steps of restore are running. 1218 func (a *MachineAgent) limitLoginsDuringRestore(req params.LoginRequest) error { 1219 var err error 1220 switch { 1221 case a.IsRestoreRunning(): 1222 err = apiserver.RestoreInProgressError 1223 case a.IsRestorePreparing(): 1224 err = apiserver.AboutToRestoreError 1225 } 1226 if err != nil { 1227 authTag, parseErr := names.ParseTag(req.AuthTag) 1228 if parseErr != nil { 1229 return errors.Annotate(err, "could not parse auth tag") 1230 } 1231 switch authTag := authTag.(type) { 1232 case names.UserTag: 1233 // use a restricted API mode 1234 return err 1235 case names.MachineTag: 1236 if authTag == a.Tag() { 1237 // allow logins from the local machine 1238 return nil 1239 } 1240 } 1241 return errors.Errorf("login for %q blocked because restore is in progress", authTag) 1242 } 1243 return nil 1244 } 1245 1246 // limitLoginsDuringUpgrade is called by the API server for each login 1247 // attempt. It returns an error if upgrades are in progress unless the 1248 // login is for a user (i.e. a client) or the local machine. 1249 func (a *MachineAgent) limitLoginsDuringUpgrade(req params.LoginRequest) error { 1250 if a.isUpgradeRunning() || a.isInitialUpgradeCheckPending() { 1251 authTag, err := names.ParseTag(req.AuthTag) 1252 if err != nil { 1253 return errors.Annotate(err, "could not parse auth tag") 1254 } 1255 switch authTag := authTag.(type) { 1256 case names.UserTag: 1257 // use a restricted API mode 1258 return params.UpgradeInProgressError 1259 case names.MachineTag: 1260 if authTag == a.Tag() { 1261 // allow logins from the local machine 1262 return nil 1263 } 1264 } 1265 return errors.Errorf("login for %q blocked because %s", authTag, params.CodeUpgradeInProgress) 1266 } else { 1267 return nil // allow all logins 1268 } 1269 } 1270 1271 var stateWorkerServingConfigErr = errors.New("state worker started with no state serving info") 1272 1273 // ensureMongoServer ensures that mongo is installed and running, 1274 // and ready for opening a state connection. 1275 func (a *MachineAgent) ensureMongoServer(agentConfig agent.Config) (err error) { 1276 a.mongoInitMutex.Lock() 1277 defer a.mongoInitMutex.Unlock() 1278 if a.mongoInitialized { 1279 logger.Debugf("mongo is already initialized") 1280 return nil 1281 } 1282 defer func() { 1283 if err == nil { 1284 a.mongoInitialized = true 1285 } 1286 }() 1287 1288 mongoInstalled, err := mongo.IsServiceInstalled() 1289 if err != nil { 1290 return errors.Annotate(err, "error while checking if mongodb service is installed") 1291 } 1292 1293 if !mongoInstalled { 1294 // EnsureMongoServer installs/upgrades the init config as necessary. 1295 ensureServerParams, err := cmdutil.NewEnsureServerParams(agentConfig) 1296 if err != nil { 1297 return err 1298 } 1299 if err := cmdutil.EnsureMongoServer(ensureServerParams); err != nil { 1300 return err 1301 } 1302 } 1303 logger.Debugf("mongodb service is installed") 1304 1305 // Mongo is installed, record the version. 1306 err = a.ChangeConfig(func(config agent.ConfigSetter) error { 1307 config.SetMongoVersion(mongo.InstalledVersion()) 1308 return nil 1309 }) 1310 if err != nil { 1311 return errors.Annotate(err, "cannot set mongo version") 1312 } 1313 return nil 1314 } 1315 1316 func openState(agentConfig agent.Config, dialOpts mongo.DialOpts) (_ *state.State, _ *state.Machine, err error) { 1317 info, ok := agentConfig.MongoInfo() 1318 if !ok { 1319 return nil, nil, errors.Errorf("no state info available") 1320 } 1321 st, err := state.Open(agentConfig.Model(), agentConfig.Controller(), info, dialOpts, 1322 stateenvirons.GetNewPolicyFunc( 1323 stateenvirons.GetNewEnvironFunc(environs.New), 1324 ), 1325 ) 1326 if err != nil { 1327 return nil, nil, err 1328 } 1329 defer func() { 1330 if err != nil { 1331 st.Close() 1332 } 1333 }() 1334 m0, err := st.FindEntity(agentConfig.Tag()) 1335 if err != nil { 1336 if errors.IsNotFound(err) { 1337 err = worker.ErrTerminateAgent 1338 } 1339 return nil, nil, err 1340 } 1341 m := m0.(*state.Machine) 1342 if m.Life() == state.Dead { 1343 return nil, nil, worker.ErrTerminateAgent 1344 } 1345 // Check the machine nonce as provisioned matches the agent.Conf value. 1346 if !m.CheckProvisioned(agentConfig.Nonce()) { 1347 // The agent is running on a different machine to the one it 1348 // should be according to state. It must stop immediately. 1349 logger.Errorf("running machine %v agent on inappropriate instance", m) 1350 return nil, nil, worker.ErrTerminateAgent 1351 } 1352 return st, m, nil 1353 } 1354 1355 func getMachine(st *state.State, tag names.Tag) (*state.Machine, error) { 1356 m0, err := st.FindEntity(tag) 1357 if err != nil { 1358 return nil, err 1359 } 1360 return m0.(*state.Machine), nil 1361 } 1362 1363 // startWorkerAfterUpgrade starts a worker to run the specified child worker 1364 // but only after waiting for upgrades to complete. 1365 func (a *MachineAgent) startWorkerAfterUpgrade(runner worker.Runner, name string, start func() (worker.Worker, error)) { 1366 runner.StartWorker(name, func() (worker.Worker, error) { 1367 return a.upgradeWaiterWorker(name, start), nil 1368 }) 1369 } 1370 1371 // upgradeWaiterWorker runs the specified worker after upgrades have completed. 1372 func (a *MachineAgent) upgradeWaiterWorker(name string, start func() (worker.Worker, error)) worker.Worker { 1373 return worker.NewSimpleWorker(func(stop <-chan struct{}) error { 1374 // Wait for the agent upgrade and upgrade steps to complete (or for us to be stopped). 1375 for _, ch := range []<-chan struct{}{ 1376 a.upgradeComplete.Unlocked(), 1377 a.initialUpgradeCheckComplete.Unlocked(), 1378 } { 1379 select { 1380 case <-stop: 1381 return nil 1382 case <-ch: 1383 } 1384 } 1385 logger.Debugf("upgrades done, starting worker %q", name) 1386 1387 // Upgrades are done, start the worker. 1388 w, err := start() 1389 if err != nil { 1390 return err 1391 } 1392 // Wait for worker to finish or for us to be stopped. 1393 done := make(chan error, 1) 1394 go func() { 1395 done <- w.Wait() 1396 }() 1397 select { 1398 case err := <-done: 1399 return errors.Annotatef(err, "worker %q exited", name) 1400 case <-stop: 1401 logger.Debugf("stopping so killing worker %q", name) 1402 return worker.Stop(w) 1403 } 1404 }) 1405 } 1406 1407 // WorkersStarted returns a channel that's closed once all top level workers 1408 // have been started. This is provided for testing purposes. 1409 func (a *MachineAgent) WorkersStarted() <-chan struct{} { 1410 return a.workersStarted 1411 } 1412 1413 func (a *MachineAgent) Tag() names.Tag { 1414 return names.NewMachineTag(a.machineId) 1415 } 1416 1417 func (a *MachineAgent) createJujudSymlinks(dataDir string) error { 1418 jujud := filepath.Join(tools.ToolsDir(dataDir, a.Tag().String()), jujunames.Jujud) 1419 for _, link := range []string{jujuRun, jujuDumpLogs} { 1420 err := a.createSymlink(jujud, link) 1421 if err != nil { 1422 return errors.Annotatef(err, "failed to create %s symlink", link) 1423 } 1424 } 1425 return nil 1426 } 1427 1428 func (a *MachineAgent) createSymlink(target, link string) error { 1429 fullLink := utils.EnsureBaseDir(a.rootDir, link) 1430 1431 currentTarget, err := symlink.Read(fullLink) 1432 if err != nil && !os.IsNotExist(err) { 1433 return err 1434 } else if err == nil { 1435 // Link already in place - check it. 1436 if currentTarget == target { 1437 // Link already points to the right place - nothing to do. 1438 return nil 1439 } 1440 // Link points to the wrong place - delete it. 1441 if err := os.Remove(fullLink); err != nil { 1442 return err 1443 } 1444 } 1445 1446 if err := os.MkdirAll(filepath.Dir(fullLink), os.FileMode(0755)); err != nil { 1447 return err 1448 } 1449 return symlink.New(target, fullLink) 1450 } 1451 1452 func (a *MachineAgent) removeJujudSymlinks() (errs []error) { 1453 for _, link := range []string{jujuRun, jujuDumpLogs} { 1454 err := os.Remove(utils.EnsureBaseDir(a.rootDir, link)) 1455 if err != nil && !os.IsNotExist(err) { 1456 errs = append(errs, errors.Annotatef(err, "failed to remove %s symlink", link)) 1457 } 1458 } 1459 return 1460 } 1461 1462 func (a *MachineAgent) uninstallAgent() error { 1463 // We should only uninstall if the uninstall file is present. 1464 if !agent.CanUninstall(a) { 1465 logger.Infof("ignoring uninstall request") 1466 return nil 1467 } 1468 logger.Infof("uninstalling agent") 1469 1470 agentConfig := a.CurrentConfig() 1471 var errs []error 1472 agentServiceName := agentConfig.Value(agent.AgentServiceName) 1473 if agentServiceName == "" { 1474 // For backwards compatibility, handle lack of AgentServiceName. 1475 agentServiceName = os.Getenv("UPSTART_JOB") 1476 } 1477 1478 if agentServiceName != "" { 1479 svc, err := service.DiscoverService(agentServiceName, common.Conf{}) 1480 if err != nil { 1481 errs = append(errs, errors.Errorf("cannot remove service %q: %v", agentServiceName, err)) 1482 } else if err := svc.Remove(); err != nil { 1483 errs = append(errs, errors.Errorf("cannot remove service %q: %v", agentServiceName, err)) 1484 } 1485 } 1486 1487 errs = append(errs, a.removeJujudSymlinks()...) 1488 1489 // TODO(fwereade): surely this shouldn't be happening here? Once we're 1490 // at this point we should expect to be killed in short order; if this 1491 // work is remotely important we should be blocking machine death on 1492 // its completion. 1493 insideContainer := container.RunningInContainer() 1494 if insideContainer { 1495 // We're running inside a container, so loop devices may leak. Detach 1496 // any loop devices that are backed by files on this machine. 1497 if err := a.loopDeviceManager.DetachLoopDevices("/", agentConfig.DataDir()); err != nil { 1498 errs = append(errs, err) 1499 } 1500 } 1501 1502 if err := mongo.RemoveService(); err != nil { 1503 errs = append(errs, errors.Annotate(err, "cannot stop/remove mongo service")) 1504 } 1505 if err := os.RemoveAll(agentConfig.DataDir()); err != nil { 1506 errs = append(errs, err) 1507 } 1508 if len(errs) == 0 { 1509 return nil 1510 } 1511 return errors.Errorf("uninstall failed: %v", errs) 1512 } 1513 1514 type MongoSessioner interface { 1515 MongoSession() *mgo.Session 1516 } 1517 1518 func newSingularStateRunner(runner worker.Runner, st MongoSessioner, m *state.Machine) (worker.Runner, error) { 1519 singularStateConn := singularStateConn{st.MongoSession(), m} 1520 singularRunner, err := newSingularRunner(runner, singularStateConn) 1521 if err != nil { 1522 return nil, errors.Annotate(err, "cannot make singular State Runner") 1523 } 1524 return singularRunner, err 1525 } 1526 1527 // singularStateConn implements singular.Conn on 1528 // top of a State connection. 1529 type singularStateConn struct { 1530 session *mgo.Session 1531 machine *state.Machine 1532 } 1533 1534 func (c singularStateConn) IsMaster() (bool, error) { 1535 return mongo.IsMaster(c.session, c.machine) 1536 } 1537 1538 func (c singularStateConn) Ping() error { 1539 return c.session.Ping() 1540 } 1541 1542 func metricAPI(st api.Connection) (metricsmanager.MetricsManagerClient, error) { 1543 client, err := metricsmanager.NewClient(st) 1544 if err != nil { 1545 return nil, errors.Trace(err) 1546 } 1547 return client, nil 1548 } 1549 1550 // newDeployContext gives the tests the opportunity to create a deployer.Context 1551 // that can be used for testing so as to avoid (1) deploying units to the system 1552 // running the tests and (2) get access to the *State used internally, so that 1553 // tests can be run without waiting for the 5s watcher refresh time to which we would 1554 // otherwise be restricted. 1555 var newDeployContext = func(st *apideployer.State, agentConfig agent.Config) deployer.Context { 1556 return deployer.NewSimpleContext(agentConfig, st) 1557 }