github.com/axw/juju@v0.0.0-20161005053422-4bd6544d08d4/cmd/jujud/agent/machine.go (about) 1 // Copyright 2012, 2013 Canonical Ltd. 2 // Licensed under the AGPLv3, see LICENCE file for details. 3 4 package agent 5 6 import ( 7 "fmt" 8 "net" 9 "os" 10 "path/filepath" 11 "runtime" 12 "strconv" 13 "strings" 14 "sync" 15 "time" 16 17 "github.com/juju/cmd" 18 "github.com/juju/errors" 19 "github.com/juju/gnuflag" 20 "github.com/juju/juju/api" 21 apiagent "github.com/juju/juju/api/agent" 22 "github.com/juju/juju/api/base" 23 apimachiner "github.com/juju/juju/api/machiner" 24 "github.com/juju/juju/controller" 25 "github.com/juju/loggo" 26 "github.com/juju/replicaset" 27 "github.com/juju/utils" 28 "github.com/juju/utils/clock" 29 "github.com/juju/utils/featureflag" 30 "github.com/juju/utils/series" 31 "github.com/juju/utils/set" 32 "github.com/juju/utils/symlink" 33 "github.com/juju/utils/voyeur" 34 "github.com/juju/version" 35 "gopkg.in/juju/charmrepo.v2-unstable" 36 "gopkg.in/juju/names.v2" 37 "gopkg.in/mgo.v2" 38 "gopkg.in/natefinch/lumberjack.v2" 39 "gopkg.in/tomb.v1" 40 41 "github.com/juju/juju/agent" 42 "github.com/juju/juju/agent/tools" 43 apideployer "github.com/juju/juju/api/deployer" 44 "github.com/juju/juju/api/metricsmanager" 45 apiprovisioner "github.com/juju/juju/api/provisioner" 46 "github.com/juju/juju/apiserver" 47 "github.com/juju/juju/apiserver/observer" 48 "github.com/juju/juju/apiserver/params" 49 "github.com/juju/juju/audit" 50 "github.com/juju/juju/cert" 51 "github.com/juju/juju/cmd/jujud/agent/machine" 52 "github.com/juju/juju/cmd/jujud/agent/model" 53 "github.com/juju/juju/cmd/jujud/reboot" 54 cmdutil "github.com/juju/juju/cmd/jujud/util" 55 "github.com/juju/juju/container" 56 "github.com/juju/juju/container/kvm" 57 "github.com/juju/juju/environs" 58 "github.com/juju/juju/environs/simplestreams" 59 "github.com/juju/juju/instance" 60 jujunames "github.com/juju/juju/juju/names" 61 "github.com/juju/juju/juju/paths" 62 "github.com/juju/juju/mongo" 63 
"github.com/juju/juju/service" 64 "github.com/juju/juju/service/common" 65 "github.com/juju/juju/state" 66 "github.com/juju/juju/state/multiwatcher" 67 "github.com/juju/juju/state/stateenvirons" 68 "github.com/juju/juju/storage/looputil" 69 "github.com/juju/juju/upgrades" 70 jujuversion "github.com/juju/juju/version" 71 "github.com/juju/juju/watcher" 72 "github.com/juju/juju/worker" 73 "github.com/juju/juju/worker/apicaller" 74 "github.com/juju/juju/worker/certupdater" 75 "github.com/juju/juju/worker/conv2state" 76 "github.com/juju/juju/worker/dblogpruner" 77 "github.com/juju/juju/worker/dependency" 78 "github.com/juju/juju/worker/deployer" 79 "github.com/juju/juju/worker/gate" 80 "github.com/juju/juju/worker/imagemetadataworker" 81 "github.com/juju/juju/worker/introspection" 82 "github.com/juju/juju/worker/logsender" 83 "github.com/juju/juju/worker/migrationmaster" 84 "github.com/juju/juju/worker/modelworkermanager" 85 "github.com/juju/juju/worker/mongoupgrader" 86 "github.com/juju/juju/worker/peergrouper" 87 "github.com/juju/juju/worker/provisioner" 88 "github.com/juju/juju/worker/singular" 89 "github.com/juju/juju/worker/txnpruner" 90 "github.com/juju/juju/worker/upgradesteps" 91 ) 92 93 var ( 94 logger = loggo.GetLogger("juju.cmd.jujud") 95 jujuRun = paths.MustSucceed(paths.JujuRun(series.HostSeries())) 96 jujuDumpLogs = paths.MustSucceed(paths.JujuDumpLogs(series.HostSeries())) 97 98 // The following are defined as variables to allow the tests to 99 // intercept calls to the functions. In every case, they should 100 // be expressed as explicit dependencies, but nobody has yet had 101 // the intestinal fortitude to untangle this package. Be that 102 // person! Juju Needs You. 
103 useMultipleCPUs = utils.UseMultipleCPUs 104 newSingularRunner = singular.New 105 peergrouperNew = peergrouper.New 106 newCertificateUpdater = certupdater.NewCertificateUpdater 107 newMetadataUpdater = imagemetadataworker.NewWorker 108 newUpgradeMongoWorker = mongoupgrader.New 109 reportOpenedState = func(*state.State) {} 110 111 modelManifolds = model.Manifolds 112 machineManifolds = machine.Manifolds 113 ) 114 115 // Variable to override in tests, default is true 116 var ProductionMongoWriteConcern = true 117 118 func init() { 119 stateWorkerDialOpts = mongo.DefaultDialOpts() 120 stateWorkerDialOpts.PostDial = func(session *mgo.Session) error { 121 safe := mgo.Safe{} 122 if ProductionMongoWriteConcern { 123 safe.J = true 124 _, err := replicaset.CurrentConfig(session) 125 if err == nil { 126 // set mongo to write-majority (writes only returned after 127 // replicated to a majority of replica-set members). 128 safe.WMode = "majority" 129 } 130 } 131 session.SetSafe(&safe) 132 return nil 133 } 134 } 135 136 // AgentInitializer handles initializing a type for use as a Jujud 137 // agent. 138 type AgentInitializer interface { 139 AddFlags(*gnuflag.FlagSet) 140 CheckArgs([]string) error 141 } 142 143 // AgentConfigWriter encapsulates disk I/O operations with the agent 144 // config. 145 type AgentConfigWriter interface { 146 // ReadConfig reads the config for the given tag from disk. 147 ReadConfig(tag string) error 148 // ChangeConfig executes the given agent.ConfigMutator in a 149 // thread-safe context. 150 ChangeConfig(agent.ConfigMutator) error 151 // CurrentConfig returns a copy of the in-memory agent config. 152 CurrentConfig() agent.Config 153 } 154 155 // NewMachineAgentCmd creates a Command which handles parsing 156 // command-line arguments and instantiating and running a 157 // MachineAgent. 
func NewMachineAgentCmd(
	ctx *cmd.Context,
	machineAgentFactory func(string) *MachineAgent,
	agentInitializer AgentInitializer,
	configFetcher AgentConfigWriter,
) cmd.Command {
	return &machineAgentCmd{
		ctx:                 ctx,
		machineAgentFactory: machineAgentFactory,
		agentInitializer:    agentInitializer,
		currentConfig:       configFetcher,
	}
}

// machineAgentCmd implements cmd.Command for the "machine" subcommand:
// it parses flags, loads the agent configuration and constructs and
// runs a MachineAgent.
type machineAgentCmd struct {
	cmd.CommandBase

	// This group of arguments is required.
	agentInitializer    AgentInitializer
	currentConfig       AgentConfigWriter
	machineAgentFactory func(string) *MachineAgent
	ctx                 *cmd.Context

	// This group is for debugging purposes.
	logToStdErr bool

	// The following are set via command-line flags.
	machineId string
}

// Init is called by the cmd system to initialize the structure for
// running. It validates --machine-id, delegates remaining argument
// checking to the agent initializer, and (unless logging to stderr)
// reads the agent config and redirects log output to a size-capped
// rotating log file.
func (a *machineAgentCmd) Init(args []string) error {

	if !names.IsValidMachine(a.machineId) {
		return errors.Errorf("--machine-id option must be set, and expects a non-negative integer")
	}
	if err := a.agentInitializer.CheckArgs(args); err != nil {
		return err
	}

	// Due to changes in the logging, and needing to care about old
	// models that have been upgraded, we need to explicitly remove the
	// file writer if one has been added, otherwise we will get duplicate
	// lines of all logging in the log file.
	// NOTE(review): the error return of RemoveWriter is deliberately
	// ignored — the writer may legitimately not be registered.
	loggo.RemoveWriter("logfile")

	if a.logToStdErr {
		return nil
	}

	err := a.currentConfig.ReadConfig(names.NewMachineTag(a.machineId).String())
	if err != nil {
		return errors.Annotate(err, "cannot read agent configuration")
	}

	// the context's stderr is set as the loggo writer in github.com/juju/cmd/logging.go
	a.ctx.Stderr = &lumberjack.Logger{
		Filename:   agent.LogFilename(a.currentConfig.CurrentConfig()),
		MaxSize:    300, // megabytes
		MaxBackups: 2,
	}

	return nil
}

// Run instantiates a MachineAgent and runs it.
func (a *machineAgentCmd) Run(c *cmd.Context) error {
	machineAgent := a.machineAgentFactory(a.machineId)
	return machineAgent.Run(c)
}

// SetFlags adds the requisite flags to run this command.
func (a *machineAgentCmd) SetFlags(f *gnuflag.FlagSet) {
	a.agentInitializer.AddFlags(f)
	f.StringVar(&a.machineId, "machine-id", "", "id of the machine to run")
}

// Info returns usage information for the command.
func (a *machineAgentCmd) Info() *cmd.Info {
	return &cmd.Info{
		Name:    "machine",
		Purpose: "run a juju machine agent",
	}
}

// MachineAgentFactoryFn returns a function which instantiates a
// MachineAgent given a machineId. The factory wires in a fresh worker
// runner and loop device manager for each agent it creates.
func MachineAgentFactoryFn(
	agentConfWriter AgentConfigWriter,
	bufferedLogs logsender.LogRecordCh,
	rootDir string,
) func(string) *MachineAgent {
	return func(machineId string) *MachineAgent {
		return NewMachineAgent(
			machineId,
			agentConfWriter,
			bufferedLogs,
			worker.NewRunner(cmdutil.IsFatal, cmdutil.MoreImportant, worker.RestartDelay),
			looputil.NewLoopDeviceManager(),
			rootDir,
		)
	}
}

// NewMachineAgent instantiates a new MachineAgent.
func NewMachineAgent(
	machineId string,
	agentConfWriter AgentConfigWriter,
	bufferedLogs logsender.LogRecordCh,
	runner worker.Runner,
	loopDeviceManager looputil.LoopDeviceManager,
	rootDir string,
) *MachineAgent {
	return &MachineAgent{
		machineId:                   machineId,
		AgentConfigWriter:           agentConfWriter,
		configChangedVal:            voyeur.NewValue(true),
		bufferedLogs:                bufferedLogs,
		workersStarted:              make(chan struct{}),
		runner:                      runner,
		rootDir:                     rootDir,
		initialUpgradeCheckComplete: gate.NewLock(),
		loopDeviceManager:           loopDeviceManager,
	}
}

// MachineAgent is responsible for tying together all functionality
// needed to orchestrate a Jujud instance which controls a machine.
type MachineAgent struct {
	AgentConfigWriter

	tomb      tomb.Tomb
	machineId string
	// runner supervises the top-level "engine" worker (see Run).
	runner  worker.Runner
	rootDir string
	// bufferedLogs feeds log records to the logsender worker.
	bufferedLogs logsender.LogRecordCh
	// configChangedVal is set to true every time ChangeConfig succeeds,
	// notifying observers that the agent config was rewritten.
	configChangedVal *voyeur.Value
	// upgradeComplete is unlocked once upgrade steps have run; it is
	// created lazily in Run via upgradesteps.NewLock.
	upgradeComplete gate.Lock
	// workersStarted is closed once all workers have been configured
	// to start (used by tests/observers of startup progress).
	workersStarted chan struct{}

	// XXX(fwereade): these smell strongly of goroutine-unsafeness.
	restoreMode bool
	restoring   bool

	// Used to signal that the upgrade worker will not
	// reboot the agent on startup because there are no
	// longer any immediately pending agent upgrades.
	initialUpgradeCheckComplete gate.Lock

	discoverSpacesComplete gate.Lock

	mongoInitMutex   sync.Mutex
	mongoInitialized bool

	loopDeviceManager looputil.LoopDeviceManager
}

// IsRestorePreparing returns bool representing if we are in restore mode
// but not running restore.
func (a *MachineAgent) IsRestorePreparing() bool {
	return a.restoreMode && !a.restoring
}

// IsRestoreRunning returns bool representing if we are in restore mode
// and running the actual restore process.
func (a *MachineAgent) IsRestoreRunning() bool {
	return a.restoring
}

// isUpgradeRunning reports whether upgrade steps are still pending or
// in progress (the upgrade-steps gate has not yet been unlocked).
func (a *MachineAgent) isUpgradeRunning() bool {
	return !a.upgradeComplete.IsUnlocked()
}

// isInitialUpgradeCheckPending reports whether the startup check for
// immediately pending agent upgrades has not yet completed.
func (a *MachineAgent) isInitialUpgradeCheckPending() bool {
	return !a.initialUpgradeCheckComplete.IsUnlocked()
}

// Wait waits for the machine agent to finish.
func (a *MachineAgent) Wait() error {
	return a.tomb.Wait()
}

// Stop stops the machine agent by killing the top-level runner and
// waiting for the agent's tomb to report completion.
func (a *MachineAgent) Stop() error {
	a.runner.Kill()
	return a.tomb.Wait()
}

// upgradeCertificateDNSNames ensures that the controller certificate
// recorded in the agent config and also mongo server.pem contains the
// DNSNames entries required by Juju. If all required names are already
// present this is a no-op; otherwise a new server certificate is
// generated and written to both the mongo pem file and the agent config.
func (a *MachineAgent) upgradeCertificateDNSNames() error {
	agentConfig := a.CurrentConfig()
	si, ok := agentConfig.StateServingInfo()
	if !ok || si.CAPrivateKey == "" {
		// No certificate information exists yet, nothing to do.
		return nil
	}
	// Parse the current certificate to get the current dns names.
	serverCert, err := cert.ParseCert(si.Cert)
	if err != nil {
		return err
	}
	update := false
	dnsNames := set.NewStrings(serverCert.DNSNames...)
	requiredDNSNames := []string{"local", "juju-apiserver", "juju-mongodb"}
	for _, dnsName := range requiredDNSNames {
		if dnsNames.Contains(dnsName) {
			continue
		}
		dnsNames.Add(dnsName)
		update = true
	}
	if !update {
		return nil
	}
	// Write a new certificate to the mongo pem and agent config files.
	si.Cert, si.PrivateKey, err = cert.NewDefaultServer(agentConfig.CACert(), si.CAPrivateKey, dnsNames.Values())
	if err != nil {
		return err
	}
	if err := mongo.UpdateSSLKey(agentConfig.DataDir(), si.Cert, si.PrivateKey); err != nil {
		return err
	}
	return a.AgentConfigWriter.ChangeConfig(func(config agent.ConfigSetter) error {
		config.SetStateServingInfo(si)
		return nil
	})
}

// Run runs a machine agent: it reads the agent config, repairs the
// server certificate if needed, creates the upgrade-completion lock,
// starts the dependency engine under the runner, and then blocks until
// the runner exits — translating terminate/reboot/shutdown sentinel
// errors into the corresponding agent actions.
func (a *MachineAgent) Run(*cmd.Context) error {

	defer a.tomb.Done()
	if err := a.ReadConfig(a.Tag().String()); err != nil {
		return errors.Errorf("cannot read agent configuration: %v", err)
	}

	logger.Infof("machine agent %v start (%s [%s])", a.Tag(), jujuversion.Current, runtime.Compiler)
	if flags := featureflag.String(); flags != "" {
		logger.Warningf("developer feature flags enabled: %s", flags)
	}
	if err := introspection.WriteProfileFunctions(); err != nil {
		// This isn't fatal, just annoying.
		logger.Errorf("failed to write profile funcs: %v", err)
	}

	// Before doing anything else, we need to make sure the certificate generated for
	// use by mongo to validate controller connections is correct. This needs to be done
	// before any possible restart of the mongo service.
	// See bug http://pad.lv/1434680
	if err := a.upgradeCertificateDNSNames(); err != nil {
		return errors.Annotate(err, "error upgrading server certificate")
	}

	if upgradeComplete, err := upgradesteps.NewLock(a); err != nil {
		return errors.Annotate(err, "error during creating upgrade completion channel")
	} else {
		a.upgradeComplete = upgradeComplete
	}

	agentConfig := a.CurrentConfig()
	createEngine := a.makeEngineCreator(agentConfig.UpgradedToVersion())
	charmrepo.CacheDir = filepath.Join(agentConfig.DataDir(), "charmcache")
	if err := a.createJujudSymlinks(agentConfig.DataDir()); err != nil {
		return err
	}
	// NOTE(review): StartWorker's error return is ignored here — the
	// runner was just created so a duplicate-name error presumably
	// cannot occur; confirm before relying on this.
	a.runner.StartWorker("engine", createEngine)

	// At this point, all workers will have been configured to start
	close(a.workersStarted)
	err := a.runner.Wait()
	switch errors.Cause(err) {
	case worker.ErrTerminateAgent:
		err = a.uninstallAgent()
	case worker.ErrRebootMachine:
		logger.Infof("Caught reboot error")
		err = a.executeRebootOrShutdown(params.ShouldReboot)
	case worker.ErrShutdownMachine:
		logger.Infof("Caught shutdown error")
		err = a.executeRebootOrShutdown(params.ShouldShutdown)
	}
	err = cmdutil.AgentDone(logger, err)
	a.tomb.Kill(err)
	return err
}

// makeEngineCreator returns a start-func that builds the machine
// agent's dependency engine, installs the machine manifolds into it,
// and starts the introspection worker alongside it. The returned
// function is run (and re-run on failure) by the agent's runner.
func (a *MachineAgent) makeEngineCreator(previousAgentVersion version.Number) func() (worker.Worker, error) {
	return func() (worker.Worker, error) {
		config := dependency.EngineConfig{
			IsFatal:     cmdutil.IsFatal,
			WorstError:  cmdutil.MoreImportantError,
			ErrorDelay:  3 * time.Second,
			BounceDelay: 10 * time.Millisecond,
		}
		engine, err := dependency.NewEngine(config)
		if err != nil {
			return nil, err
		}
		manifolds := machineManifolds(machine.ManifoldsConfig{
			PreviousAgentVersion: previousAgentVersion,
			Agent:                agent.APIHostPortsSetter{Agent: a},
			RootDir:              a.rootDir,
			AgentConfigChanged:   a.configChangedVal,
			UpgradeStepsLock:     a.upgradeComplete,
			UpgradeCheckLock:     a.initialUpgradeCheckComplete,
			OpenState:            a.initState,
			OpenStateForUpgrade:  a.openStateForUpgrade,
			StartStateWorkers:    a.startStateWorkers,
			StartAPIWorkers:      a.startAPIWorkers,
			PreUpgradeSteps:      upgrades.PreUpgradeSteps,
			LogSource:            a.bufferedLogs,
			NewDeployContext:     newDeployContext,
			Clock:                clock.WallClock,
			ValidateMigration:    a.validateMigration,
		})
		if err := dependency.Install(engine, manifolds); err != nil {
			if err := worker.Stop(engine); err != nil {
				logger.Errorf("while stopping engine with bad manifolds: %v", err)
			}
			return nil, err
		}
		if err := startIntrospection(introspectionConfig{
			Agent:      a,
			Engine:     engine,
			WorkerFunc: introspection.NewWorker,
		}); err != nil {
			// If the introspection worker failed to start, we just log error
			// but continue. It is very unlikely to happen in the real world
			// as the only issue is connecting to the abstract domain socket
			// and the agent is controlled by the OS to only have one.
			logger.Errorf("failed to start introspection worker: %v", err)
		}
		return engine, nil
	}
}

// executeRebootOrShutdown reconnects to the API (all prior connections
// are closed by this point), waits for units/containers to be ready,
// and performs the requested reboot or shutdown action.
func (a *MachineAgent) executeRebootOrShutdown(action params.RebootAction) error {
	// At this stage, all API connections would have been closed
	// We need to reopen the API to clear the reboot flag after
	// scheduling the reboot. It may be cleaner to do this in the reboot
	// worker, before returning the ErrRebootMachine.
	conn, err := apicaller.OnlyConnect(a, api.Open)
	if err != nil {
		logger.Infof("Reboot: Error connecting to state")
		return errors.Trace(err)
	}

	// block until all units/containers are ready, and reboot/shutdown
	finalize, err := reboot.NewRebootWaiter(conn, a.CurrentConfig())
	if err != nil {
		return errors.Trace(err)
	}

	logger.Infof("Reboot: Executing reboot")
	err = finalize.ExecuteReboot(action)
	if err != nil {
		logger.Infof("Reboot: Error executing reboot: %v", err)
		return errors.Trace(err)
	}
	// On windows, the shutdown command is asynchronous. We return ErrRebootMachine
	// so the agent will simply exit without error pending reboot/shutdown.
	return worker.ErrRebootMachine
}

// ChangeConfig applies the given mutator to the agent config and then
// signals configChangedVal so observers pick up the new config. The
// signal is sent even if the mutation failed.
func (a *MachineAgent) ChangeConfig(mutate agent.ConfigMutator) error {
	err := a.AgentConfigWriter.ChangeConfig(mutate)
	a.configChangedVal.Set(true)
	return errors.Trace(err)
}

// maybeStopMongo marks the agent config for a mongo upgrade when the
// requested version is newer than the currently configured one. It is
// a no-op until mongo has been initialized.
func (a *MachineAgent) maybeStopMongo(ver mongo.Version, isMaster bool) error {
	if !a.mongoInitialized {
		return nil
	}

	conf := a.AgentConfigWriter.CurrentConfig()
	v := conf.MongoVersion()

	logger.Errorf("Got version change %v", ver)
	// TODO(perrito666) replace with "read-only" mode for environment when
	// it is available.
	if ver.NewerThan(v) > 0 {
		err := a.AgentConfigWriter.ChangeConfig(func(config agent.ConfigSetter) error {
			config.SetMongoVersion(mongo.MongoUpgrade)
			return nil
		})
		if err != nil {
			return err
		}

	}
	return nil

}

// PrepareRestore will flag the agent to allow only a limited set
// of commands defined in
// "github.com/juju/juju/apiserver".allowedMethodsAboutToRestore
// the most noteworthy is:
// Backups.Restore: this will ensure that we can do all the file movements
// required for restore and no one will do changes while we do that.
// It will return an error if the machine is already in this state.
func (a *MachineAgent) PrepareRestore() error {
	if a.restoreMode {
		return errors.Errorf("already in restore mode")
	}
	a.restoreMode = true
	return nil
}

// BeginRestore will flag the agent to disallow all commands since
// restore should be running and therefore making changes that
// would override anything done.
func (a *MachineAgent) BeginRestore() error {
	switch {
	case !a.restoreMode:
		return errors.Errorf("not in restore mode, cannot begin restoration")
	case a.restoring:
		return errors.Errorf("already restoring")
	}
	a.restoring = true
	return nil
}

// EndRestore will flag the agent to allow all commands
// This being invoked means that restore process failed
// since success restarts the agent.
func (a *MachineAgent) EndRestore() {
	a.restoreMode = false
	a.restoring = false
}

// newRestoreStateWatcherWorker will return a worker or err if there
// is a failure, the worker takes care of watching the state of
// restoreInfo doc and put the agent in the different restore modes.
func (a *MachineAgent) newRestoreStateWatcherWorker(st *state.State) (worker.Worker, error) {
	rWorker := func(stopch <-chan struct{}) error {
		return a.restoreStateWatcher(st, stopch)
	}
	return worker.NewSimpleWorker(rWorker), nil
}

// restoreChanged will be called whenever restoreInfo doc changes signaling a new
// step in the restore process.
func (a *MachineAgent) restoreChanged(st *state.State) error {
	status, err := st.RestoreInfo().Status()
	if err != nil {
		return errors.Annotate(err, "cannot read restore state")
	}
	// Map the persisted restore status onto the agent's restore flags.
	// NOTE(review): errors from PrepareRestore/BeginRestore (e.g.
	// "already in restore mode") are ignored here — presumably repeated
	// notifications for the same status are expected; confirm.
	switch status {
	case state.RestorePending:
		a.PrepareRestore()
	case state.RestoreInProgress:
		a.BeginRestore()
	case state.RestoreFailed:
		a.EndRestore()
	}
	return nil
}

// restoreStateWatcher watches for restoreInfo looking for changes in
// the restore process, dispatching each change to restoreChanged until
// stopped via stopch or until a change handler fails.
func (a *MachineAgent) restoreStateWatcher(st *state.State, stopch <-chan struct{}) error {
	restoreWatch := st.WatchRestoreInfoChanges()
	defer func() {
		restoreWatch.Kill()
		restoreWatch.Wait()
	}()

	for {
		select {
		case <-restoreWatch.Changes():
			if err := a.restoreChanged(st); err != nil {
				return err
			}
		case <-stopch:
			return nil
		}
	}
}

// newEnvirons is a test seam for environs.New.
var newEnvirons = environs.New

// startAPIWorkers is called to start workers which rely on the
// machine agent's API connection (via the apiworkers manifold). It
// returns a Runner with a number of workers attached to it.
//
// The workers started here need to be converted to run under the
// dependency engine. Once they have all been converted, this method -
// and the apiworkers manifold - can be removed.
func (a *MachineAgent) startAPIWorkers(apiConn api.Connection) (_ worker.Worker, outErr error) {
	agentConfig := a.CurrentConfig()

	entity, err := apiagent.NewState(apiConn).Entity(a.Tag())
	if err != nil {
		return nil, errors.Trace(err)
	}

	var isModelManager bool
	for _, job := range entity.Jobs() {
		switch job {
		case multiwatcher.JobManageModel:
			isModelManager = true
		default:
			// TODO(dimitern): Once all workers moved over to using
			// the API, report "unknown job type" here.
		}
	}

	runner := newConnRunner(apiConn)
	defer func() {
		// If startAPIWorkers exits early with an error, stop the
		// runner so that any already started runners aren't leaked.
		if outErr != nil {
			worker.Stop(runner)
		}
	}()

	// Perform the operations needed to set up hosting for containers.
	if err := a.setupContainerSupport(runner, apiConn, agentConfig); err != nil {
		cause := errors.Cause(err)
		if params.IsCodeDead(cause) || cause == worker.ErrTerminateAgent {
			return nil, worker.ErrTerminateAgent
		}
		return nil, errors.Errorf("setting up container support: %v", err)
	}

	if isModelManager {

		// Published image metadata for some providers are in simple streams.
		// Providers that do not depend on simple streams do not need this worker.
		env, err := environs.GetEnviron(apiagent.NewState(apiConn), newEnvirons)
		if err != nil {
			return nil, errors.Annotate(err, "getting environ")
		}
		if _, ok := env.(simplestreams.HasRegion); ok {
			// Start worker that stores published image metadata in state.
			runner.StartWorker("imagemetadata", func() (worker.Worker, error) {
				return newMetadataUpdater(apiConn.MetadataUpdater()), nil
			})
		}

		// We don't have instance info set and the network config for the
		// bootstrap machine only, so update it now. All the other machines will
		// have instance info including network config set at provisioning time.
		if err := a.setControllerNetworkConfig(apiConn); err != nil {
			return nil, errors.Annotate(err, "setting controller network config")
		}
	} else {
		// Non-managers watch for promotion to controller.
		runner.StartWorker("stateconverter", func() (worker.Worker, error) {
			// TODO(fwereade): this worker needs its own facade.
			facade := apimachiner.NewState(apiConn)
			handler := conv2state.New(facade, a)
			w, err := watcher.NewNotifyWorker(watcher.NotifyConfig{
				Handler: handler,
			})
			if err != nil {
				return nil, errors.Annotate(err, "cannot start controller promoter worker")
			}
			return w, nil
		})
	}
	return runner, nil
}

// setControllerNetworkConfig pushes the machine's provider network
// configuration to the controller via the machiner facade. A missing
// or dead machine terminates the agent.
func (a *MachineAgent) setControllerNetworkConfig(apiConn api.Connection) error {
	machinerAPI := apimachiner.NewState(apiConn)
	agentConfig := a.CurrentConfig()

	tag := agentConfig.Tag().(names.MachineTag)
	machine, err := machinerAPI.Machine(tag)
	if errors.IsNotFound(err) || err == nil && machine.Life() == params.Dead {
		return worker.ErrTerminateAgent
	}
	if err != nil {
		return errors.Annotatef(err, "cannot load machine %s from state", tag)
	}

	if err := machine.SetProviderNetworkConfig(); err != nil {
		return errors.Annotate(err, "cannot set controller provider network config")
	}
	return nil
}

// Restart restarts the agent's service.
func (a *MachineAgent) Restart() error {
	name := a.CurrentConfig().Value(agent.AgentServiceName)
	return service.Restart(name)
}

// openStateForUpgrade exists to be passed into the upgradesteps
// worker. The upgradesteps worker opens state independently of the
// state worker so that it isn't affected by the state worker's
// lifetime. It ensures the MongoDB server is configured and started,
// and then opens a state connection.
//
// TODO(mjs)- review the need for this once the dependency engine is
// in use. Why can't upgradesteps depend on the main state connection?
func (a *MachineAgent) openStateForUpgrade() (*state.State, error) {
	agentConfig := a.CurrentConfig()
	if err := a.ensureMongoServer(agentConfig); err != nil {
		return nil, errors.Trace(err)
	}
	info, ok := agentConfig.MongoInfo()
	if !ok {
		return nil, errors.New("no state info available")
	}
	st, err := state.Open(agentConfig.Model(), agentConfig.Controller(), info, mongo.DefaultDialOpts(),
		stateenvirons.GetNewPolicyFunc(
			stateenvirons.GetNewEnvironFunc(environs.New),
		),
	)
	if err != nil {
		return nil, errors.Trace(err)
	}
	return st, nil
}

// validateMigration is called by the migrationminion to help check
// that the agent will be ok when connected to a new controller.
func (a *MachineAgent) validateMigration(apiCaller base.APICaller) error {
	// TODO(mjs) - more extensive checks to come.
	facade := apimachiner.NewState(apiCaller)
	_, err := facade.Machine(names.NewMachineTag(a.machineId))
	return errors.Trace(err)
}

// setupContainerSupport determines what containers can be run on this machine and
// initialises suitable infrastructure to support such containers.
func (a *MachineAgent) setupContainerSupport(runner worker.Runner, st api.Connection, agentConfig agent.Config) error {
	var supportedContainers []instance.ContainerType
	supportsContainers := container.ContainersSupported()
	if supportsContainers {
		supportedContainers = append(supportedContainers, instance.LXD)
	}

	supportsKvm, err := kvm.IsKVMSupported()
	if err != nil {
		// Not fatal: we simply proceed without KVM support.
		logger.Warningf("determining kvm support: %v\nno kvm containers possible", err)
	}
	if err == nil && supportsKvm {
		supportedContainers = append(supportedContainers, instance.KVM)
	}

	return a.updateSupportedContainers(runner, st, supportedContainers, agentConfig)
}

// updateSupportedContainers records in state that a machine can run the specified containers.
// It starts a watcher and when a container of a given type is first added to the machine,
// the watcher is killed, the machine is set up to be able to start containers of the given type,
// and a suitable provisioner is started.
func (a *MachineAgent) updateSupportedContainers(
	runner worker.Runner,
	st api.Connection,
	containers []instance.ContainerType,
	agentConfig agent.Config,
) error {
	pr := apiprovisioner.NewState(st)
	tag := agentConfig.Tag().(names.MachineTag)
	machine, err := pr.Machine(tag)
	if errors.IsNotFound(err) || err == nil && machine.Life() == params.Dead {
		return worker.ErrTerminateAgent
	}
	if err != nil {
		return errors.Annotatef(err, "cannot load machine %s from state", tag)
	}
	if len(containers) == 0 {
		if err := machine.SupportsNoContainers(); err != nil {
			return errors.Annotatef(err, "clearing supported containers for %s", tag)
		}
		return nil
	}
	if err := machine.SetSupportedContainers(containers...); err != nil {
		return errors.Annotatef(err, "setting supported containers for %s", tag)
	}
	// Start the watcher to fire when a container is first requested on the machine.
	watcherName := fmt.Sprintf("%s-container-watcher", machine.Id())
	params := provisioner.ContainerSetupParams{
		Runner:              runner,
		WorkerName:          watcherName,
		SupportedContainers: containers,
		Machine:             machine,
		Provisioner:         pr,
		Config:              agentConfig,
		InitLockName:        agent.MachineLockName,
	}
	handler := provisioner.NewContainerSetupHandler(params)
	a.startWorkerAfterUpgrade(runner, watcherName, func() (worker.Worker, error) {
		w, err := watcher.NewStringsWorker(watcher.StringsConfig{
			Handler: handler,
		})
		if err != nil {
			return nil, errors.Annotatef(err, "cannot start %s worker", watcherName)
		}
		return w, nil
	})
	return nil
}

// initState ensures the MongoDB server is running and then opens a
// state connection, reporting it via the reportOpenedState test hook.
func (a *MachineAgent) initState(agentConfig agent.Config) (*state.State, error) {
	// Start MongoDB server and dial.
	if err := a.ensureMongoServer(agentConfig); err != nil {
		return nil, err
	}

	st, _, err := openState(agentConfig, stateWorkerDialOpts)
	if err != nil {
		return nil, err
	}

	reportOpenedState(st)

	return st, nil
}

// startStateWorkers returns a worker running all the workers that
// require a *state.State connection.
func (a *MachineAgent) startStateWorkers(st *state.State) (worker.Worker, error) {
	agentConfig := a.CurrentConfig()

	m, err := getMachine(st, agentConfig.Tag())
	if err != nil {
		return nil, errors.Annotate(err, "machine lookup")
	}

	runner := newConnRunner(st)
	singularRunner, err := newSingularStateRunner(runner, st, m)
	if err != nil {
		return nil, errors.Trace(err)
	}

	for _, job := range m.Jobs() {
		switch job {
		case state.JobHostUnits:
			// Implemented elsewhere with workers that use the API.
		case state.JobManageModel:
			useMultipleCPUs()
			a.startWorkerAfterUpgrade(runner, "model worker manager", func() (worker.Worker, error) {
				w, err := modelworkermanager.New(modelworkermanager.Config{
					ControllerUUID: st.ControllerUUID(),
					Backend:        st,
					NewWorker:      a.startModelWorkers,
					ErrorDelay:     worker.RestartDelay,
				})
				if err != nil {
					return nil, errors.Annotate(err, "cannot start model worker manager")
				}
				return w, nil
			})
			a.startWorkerAfterUpgrade(runner, "peergrouper", func() (worker.Worker, error) {
				env, err := stateenvirons.GetNewEnvironFunc(environs.New)(st)
				if err != nil {
					return nil, errors.Annotate(err, "getting environ from state")
				}
				supportsSpaces := environs.SupportsSpaces(env)
				w, err := peergrouperNew(st, supportsSpaces)
				if err != nil {
					return nil, errors.Annotate(err, "cannot start peergrouper worker")
				}
				return w, nil
			})
			a.startWorkerAfterUpgrade(runner, "restore", func() (worker.Worker, error) {
				w, err := a.newRestoreStateWatcherWorker(st)
				if err != nil {
					return nil, errors.Annotate(err, "cannot start backup-restorer worker")
				}
				return w, nil
			})
			a.startWorkerAfterUpgrade(runner, "mongoupgrade", func() (worker.Worker, error) {
				return newUpgradeMongoWorker(st, a.machineId, a.maybeStopMongo)
			})

			// certChangedChan is shared by multiple workers it's up
			// to the agent to close it rather than any one of the
			// workers. It is possible that multiple cert changes
			// come in before the apiserver is up to receive them.
			// Specify a bigger buffer to prevent deadlock when
			// the apiserver isn't up yet. Use a size of 10 since we
			// allow up to 7 controllers, and might also update the
			// addresses of the local machine (127.0.0.1, ::1, etc).
			//
			// TODO(cherylj/waigani) Remove this workaround when
			// certupdater and apiserver can properly manage dependencies
			// through the dependency engine.
			//
			// TODO(ericsnow) For now we simply do not close the channel.
			certChangedChan := make(chan params.StateServingInfo, 10)
			// Each time apiserver worker is restarted, we need a fresh copy of state due
			// to the fact that state holds lease managers which are killed and need to be reset.
			stateOpener := func() (*state.State, error) {
				logger.Debugf("opening state for apiserver worker")
				st, _, err := openState(agentConfig, stateWorkerDialOpts)
				return st, err
			}
			runner.StartWorker("apiserver", a.apiserverWorkerStarter(stateOpener, certChangedChan))
			var stateServingSetter certupdater.StateServingInfoSetter = func(info params.StateServingInfo, done <-chan struct{}) error {
				return a.ChangeConfig(func(config agent.ConfigSetter) error {
					config.SetStateServingInfo(info)
					logger.Infof("update apiserver worker with new certificate")
					// Forward the new info to the apiserver worker,
					// unless we are asked to stop first.
					select {
					case certChangedChan <- info:
						return nil
					case <-done:
						return nil
					}
				})
			}
			a.startWorkerAfterUpgrade(runner, "certupdater", func() (worker.Worker, error) {
				return newCertificateUpdater(m, agentConfig, st, st, stateServingSetter), nil
			})

			a.startWorkerAfterUpgrade(singularRunner, "dblogpruner", func() (worker.Worker, error) {
				return dblogpruner.New(st, dblogpruner.NewLogPruneParams()), nil
			})

			a.startWorkerAfterUpgrade(singularRunner, "txnpruner", func() (worker.Worker, error) {
				return txnpruner.New(st, time.Hour*2, clock.WallClock), nil
			})
		default:
			return nil, errors.Errorf("unknown job type %q", job)
		}
	}
	return runner, nil
}

// startModelWorkers starts the set of workers that run for every model
// in each controller.
986 func (a *MachineAgent) startModelWorkers(controllerUUID, modelUUID string) (worker.Worker, error) { 987 modelAgent, err := model.WrapAgent(a, controllerUUID, modelUUID) 988 if err != nil { 989 return nil, errors.Trace(err) 990 } 991 992 engine, err := dependency.NewEngine(dependency.EngineConfig{ 993 IsFatal: model.IsFatal, 994 WorstError: model.WorstError, 995 Filter: model.IgnoreErrRemoved, 996 ErrorDelay: 3 * time.Second, 997 BounceDelay: 10 * time.Millisecond, 998 }) 999 if err != nil { 1000 return nil, errors.Trace(err) 1001 } 1002 1003 manifolds := modelManifolds(model.ManifoldsConfig{ 1004 Agent: modelAgent, 1005 AgentConfigChanged: a.configChangedVal, 1006 Clock: clock.WallClock, 1007 RunFlagDuration: time.Minute, 1008 CharmRevisionUpdateInterval: 24 * time.Hour, 1009 InstPollerAggregationDelay: 3 * time.Second, 1010 // TODO(perrito666) the status history pruning numbers need 1011 // to be adjusting, after collecting user data from large install 1012 // bases, to numbers allowing a rich and useful back history. 1013 StatusHistoryPrunerMaxHistoryTime: 336 * time.Hour, // 2 weeks 1014 StatusHistoryPrunerMaxHistoryMB: 5120, // 5G 1015 StatusHistoryPrunerInterval: 5 * time.Minute, 1016 SpacesImportedGate: a.discoverSpacesComplete, 1017 NewEnvironFunc: newEnvirons, 1018 NewMigrationMaster: migrationmaster.NewWorker, 1019 }) 1020 if err := dependency.Install(engine, manifolds); err != nil { 1021 if err := worker.Stop(engine); err != nil { 1022 logger.Errorf("while stopping engine with bad manifolds: %v", err) 1023 } 1024 return nil, errors.Trace(err) 1025 } 1026 return engine, nil 1027 } 1028 1029 // stateWorkerDialOpts is a mongo.DialOpts suitable 1030 // for use by StateWorker to dial mongo. 1031 // 1032 // This must be overridden in tests, as it assumes 1033 // journaling is enabled. 
1034 var stateWorkerDialOpts mongo.DialOpts 1035 1036 func (a *MachineAgent) apiserverWorkerStarter( 1037 stateOpener func() (*state.State, error), certChanged chan params.StateServingInfo, 1038 ) func() (worker.Worker, error) { 1039 return func() (worker.Worker, error) { 1040 st, err := stateOpener() 1041 if err != nil { 1042 return nil, errors.Trace(err) 1043 } 1044 return a.newAPIserverWorker(st, certChanged) 1045 } 1046 } 1047 1048 func (a *MachineAgent) newAPIserverWorker(st *state.State, certChanged chan params.StateServingInfo) (worker.Worker, error) { 1049 agentConfig := a.CurrentConfig() 1050 // If the configuration does not have the required information, 1051 // it is currently not a recoverable error, so we kill the whole 1052 // agent, potentially enabling human intervention to fix 1053 // the agent's configuration file. 1054 info, ok := agentConfig.StateServingInfo() 1055 if !ok { 1056 return nil, &cmdutil.FatalError{"StateServingInfo not available and we need it"} 1057 } 1058 cert := info.Cert 1059 key := info.PrivateKey 1060 1061 if len(cert) == 0 || len(key) == 0 { 1062 return nil, &cmdutil.FatalError{"configuration does not have controller cert/key"} 1063 } 1064 tag := agentConfig.Tag() 1065 dataDir := agentConfig.DataDir() 1066 logDir := agentConfig.LogDir() 1067 1068 endpoint := net.JoinHostPort("", strconv.Itoa(info.APIPort)) 1069 listener, err := net.Listen("tcp", endpoint) 1070 if err != nil { 1071 return nil, err 1072 } 1073 1074 // TODO(katco): We should be doing something more serious than 1075 // logging audit errors. Failures in the auditing systems should 1076 // stop the api server until the problem can be corrected. 
1077 auditErrorHandler := func(err error) { 1078 logger.Criticalf("%v", err) 1079 } 1080 1081 controllerConfig, err := st.ControllerConfig() 1082 if err != nil { 1083 return nil, errors.Annotate(err, "cannot fetch the controller config") 1084 } 1085 1086 server, err := apiserver.NewServer(st, listener, apiserver.ServerConfig{ 1087 Clock: clock.WallClock, 1088 Cert: cert, 1089 Key: key, 1090 Tag: tag, 1091 DataDir: dataDir, 1092 LogDir: logDir, 1093 Validator: a.limitLogins, 1094 CertChanged: certChanged, 1095 AutocertURL: controllerConfig.AutocertURL(), 1096 AutocertDNSName: controllerConfig.AutocertDNSName(), 1097 NewObserver: newObserverFn( 1098 controllerConfig, 1099 clock.WallClock, 1100 jujuversion.Current, 1101 agentConfig.Model().Id(), 1102 newAuditEntrySink(st, logDir), 1103 auditErrorHandler, 1104 ), 1105 }) 1106 if err != nil { 1107 return nil, errors.Annotate(err, "cannot start api server worker") 1108 } 1109 1110 return server, nil 1111 } 1112 1113 func newAuditEntrySink(st *state.State, logDir string) audit.AuditEntrySinkFn { 1114 persistFn := st.PutAuditEntryFn() 1115 fileSinkFn := audit.NewLogFileSink(logDir) 1116 return func(entry audit.AuditEntry) error { 1117 // We don't care about auditing anything but user actions. 1118 if _, err := names.ParseUserTag(entry.OriginName); err != nil { 1119 return nil 1120 } 1121 // TODO(wallyworld) - Pinger requests should not originate as a user action. 
1122 if strings.HasPrefix(entry.Operation, "Pinger:") { 1123 return nil 1124 } 1125 persistErr := persistFn(entry) 1126 sinkErr := fileSinkFn(entry) 1127 if persistErr == nil { 1128 return errors.Annotate(sinkErr, "cannot save audit record to file") 1129 } 1130 if sinkErr == nil { 1131 return errors.Annotate(persistErr, "cannot save audit record to database") 1132 } 1133 return errors.Annotate(persistErr, "cannot save audit record to file or database") 1134 } 1135 } 1136 1137 func newObserverFn( 1138 controllerConfig controller.Config, 1139 clock clock.Clock, 1140 jujuServerVersion version.Number, 1141 modelUUID string, 1142 persistAuditEntry audit.AuditEntrySinkFn, 1143 auditErrorHandler observer.ErrorHandler, 1144 ) observer.ObserverFactory { 1145 1146 var observerFactories []observer.ObserverFactory 1147 1148 // Common logging of RPC requests 1149 observerFactories = append(observerFactories, func() observer.Observer { 1150 logger := loggo.GetLogger("juju.apiserver") 1151 ctx := observer.RequestObserverContext{ 1152 Clock: clock, 1153 Logger: logger, 1154 } 1155 return observer.NewRequestObserver(ctx) 1156 }) 1157 1158 // Auditing observer 1159 // TODO(katco): Auditing needs feature tests (lp:1604551) 1160 if controllerConfig.AuditingEnabled() { 1161 observerFactories = append(observerFactories, func() observer.Observer { 1162 ctx := &observer.AuditContext{ 1163 JujuServerVersion: jujuServerVersion, 1164 ModelUUID: modelUUID, 1165 } 1166 return observer.NewAudit(ctx, persistAuditEntry, auditErrorHandler) 1167 }) 1168 } 1169 1170 return observer.ObserverFactoryMultiplexer(observerFactories...) 1171 1172 } 1173 1174 // limitLogins is called by the API server for each login attempt. 1175 // it returns an error if upgrades or restore are running. 
1176 func (a *MachineAgent) limitLogins(req params.LoginRequest) error { 1177 if err := a.limitLoginsDuringRestore(req); err != nil { 1178 return err 1179 } 1180 if err := a.limitLoginsDuringUpgrade(req); err != nil { 1181 return err 1182 } 1183 return a.limitLoginsDuringMongoUpgrade(req) 1184 } 1185 1186 func (a *MachineAgent) limitLoginsDuringMongoUpgrade(req params.LoginRequest) error { 1187 // If upgrade is running we will not be able to lock AgentConfigWriter 1188 // and it also means we are not upgrading mongo. 1189 if a.isUpgradeRunning() { 1190 return nil 1191 } 1192 cfg := a.AgentConfigWriter.CurrentConfig() 1193 ver := cfg.MongoVersion() 1194 if ver == mongo.MongoUpgrade { 1195 return errors.New("Upgrading Mongo") 1196 } 1197 return nil 1198 } 1199 1200 // limitLoginsDuringRestore will only allow logins for restore related purposes 1201 // while the different steps of restore are running. 1202 func (a *MachineAgent) limitLoginsDuringRestore(req params.LoginRequest) error { 1203 var err error 1204 switch { 1205 case a.IsRestoreRunning(): 1206 err = apiserver.RestoreInProgressError 1207 case a.IsRestorePreparing(): 1208 err = apiserver.AboutToRestoreError 1209 } 1210 if err != nil { 1211 authTag, parseErr := names.ParseTag(req.AuthTag) 1212 if parseErr != nil { 1213 return errors.Annotate(err, "could not parse auth tag") 1214 } 1215 switch authTag := authTag.(type) { 1216 case names.UserTag: 1217 // use a restricted API mode 1218 return err 1219 case names.MachineTag: 1220 if authTag == a.Tag() { 1221 // allow logins from the local machine 1222 return nil 1223 } 1224 } 1225 return errors.Errorf("login for %q blocked because restore is in progress", authTag) 1226 } 1227 return nil 1228 } 1229 1230 // limitLoginsDuringUpgrade is called by the API server for each login 1231 // attempt. It returns an error if upgrades are in progress unless the 1232 // login is for a user (i.e. a client) or the local machine. 
1233 func (a *MachineAgent) limitLoginsDuringUpgrade(req params.LoginRequest) error { 1234 if a.isUpgradeRunning() || a.isInitialUpgradeCheckPending() { 1235 authTag, err := names.ParseTag(req.AuthTag) 1236 if err != nil { 1237 return errors.Annotate(err, "could not parse auth tag") 1238 } 1239 switch authTag := authTag.(type) { 1240 case names.UserTag: 1241 // use a restricted API mode 1242 return params.UpgradeInProgressError 1243 case names.MachineTag: 1244 if authTag == a.Tag() { 1245 // allow logins from the local machine 1246 return nil 1247 } 1248 } 1249 return errors.Errorf("login for %q blocked because %s", authTag, params.CodeUpgradeInProgress) 1250 } else { 1251 return nil // allow all logins 1252 } 1253 } 1254 1255 var stateWorkerServingConfigErr = errors.New("state worker started with no state serving info") 1256 1257 // ensureMongoServer ensures that mongo is installed and running, 1258 // and ready for opening a state connection. 1259 func (a *MachineAgent) ensureMongoServer(agentConfig agent.Config) (err error) { 1260 a.mongoInitMutex.Lock() 1261 defer a.mongoInitMutex.Unlock() 1262 if a.mongoInitialized { 1263 logger.Debugf("mongo is already initialized") 1264 return nil 1265 } 1266 defer func() { 1267 if err == nil { 1268 a.mongoInitialized = true 1269 } 1270 }() 1271 1272 mongoInstalled, err := mongo.IsServiceInstalled() 1273 if err != nil { 1274 return errors.Annotate(err, "error while checking if mongodb service is installed") 1275 } 1276 1277 if !mongoInstalled { 1278 // EnsureMongoServer installs/upgrades the init config as necessary. 1279 ensureServerParams, err := cmdutil.NewEnsureServerParams(agentConfig) 1280 if err != nil { 1281 return err 1282 } 1283 if err := cmdutil.EnsureMongoServer(ensureServerParams); err != nil { 1284 return err 1285 } 1286 } 1287 logger.Debugf("mongodb service is installed") 1288 1289 // Mongo is installed, record the version. 
1290 err = a.ChangeConfig(func(config agent.ConfigSetter) error { 1291 config.SetMongoVersion(mongo.InstalledVersion()) 1292 return nil 1293 }) 1294 if err != nil { 1295 return errors.Annotate(err, "cannot set mongo version") 1296 } 1297 return nil 1298 } 1299 1300 func openState(agentConfig agent.Config, dialOpts mongo.DialOpts) (_ *state.State, _ *state.Machine, err error) { 1301 info, ok := agentConfig.MongoInfo() 1302 if !ok { 1303 return nil, nil, errors.Errorf("no state info available") 1304 } 1305 st, err := state.Open(agentConfig.Model(), agentConfig.Controller(), info, dialOpts, 1306 stateenvirons.GetNewPolicyFunc( 1307 stateenvirons.GetNewEnvironFunc(environs.New), 1308 ), 1309 ) 1310 if err != nil { 1311 return nil, nil, err 1312 } 1313 defer func() { 1314 if err != nil { 1315 st.Close() 1316 } 1317 }() 1318 m0, err := st.FindEntity(agentConfig.Tag()) 1319 if err != nil { 1320 if errors.IsNotFound(err) { 1321 err = worker.ErrTerminateAgent 1322 } 1323 return nil, nil, err 1324 } 1325 m := m0.(*state.Machine) 1326 if m.Life() == state.Dead { 1327 return nil, nil, worker.ErrTerminateAgent 1328 } 1329 // Check the machine nonce as provisioned matches the agent.Conf value. 1330 if !m.CheckProvisioned(agentConfig.Nonce()) { 1331 // The agent is running on a different machine to the one it 1332 // should be according to state. It must stop immediately. 1333 logger.Errorf("running machine %v agent on inappropriate instance", m) 1334 return nil, nil, worker.ErrTerminateAgent 1335 } 1336 return st, m, nil 1337 } 1338 1339 func getMachine(st *state.State, tag names.Tag) (*state.Machine, error) { 1340 m0, err := st.FindEntity(tag) 1341 if err != nil { 1342 return nil, err 1343 } 1344 return m0.(*state.Machine), nil 1345 } 1346 1347 // startWorkerAfterUpgrade starts a worker to run the specified child worker 1348 // but only after waiting for upgrades to complete. 
1349 func (a *MachineAgent) startWorkerAfterUpgrade(runner worker.Runner, name string, start func() (worker.Worker, error)) { 1350 runner.StartWorker(name, func() (worker.Worker, error) { 1351 return a.upgradeWaiterWorker(name, start), nil 1352 }) 1353 } 1354 1355 // upgradeWaiterWorker runs the specified worker after upgrades have completed. 1356 func (a *MachineAgent) upgradeWaiterWorker(name string, start func() (worker.Worker, error)) worker.Worker { 1357 return worker.NewSimpleWorker(func(stop <-chan struct{}) error { 1358 // Wait for the agent upgrade and upgrade steps to complete (or for us to be stopped). 1359 for _, ch := range []<-chan struct{}{ 1360 a.upgradeComplete.Unlocked(), 1361 a.initialUpgradeCheckComplete.Unlocked(), 1362 } { 1363 select { 1364 case <-stop: 1365 return nil 1366 case <-ch: 1367 } 1368 } 1369 logger.Debugf("upgrades done, starting worker %q", name) 1370 1371 // Upgrades are done, start the worker. 1372 w, err := start() 1373 if err != nil { 1374 return err 1375 } 1376 // Wait for worker to finish or for us to be stopped. 1377 done := make(chan error, 1) 1378 go func() { 1379 done <- w.Wait() 1380 }() 1381 select { 1382 case err := <-done: 1383 return errors.Annotatef(err, "worker %q exited", name) 1384 case <-stop: 1385 logger.Debugf("stopping so killing worker %q", name) 1386 return worker.Stop(w) 1387 } 1388 }) 1389 } 1390 1391 // WorkersStarted returns a channel that's closed once all top level workers 1392 // have been started. This is provided for testing purposes. 
func (a *MachineAgent) WorkersStarted() <-chan struct{} {
	return a.workersStarted
}

// Tag returns the machine tag for this agent's machine id.
func (a *MachineAgent) Tag() names.Tag {
	return names.NewMachineTag(a.machineId)
}

// createJujudSymlinks creates the juju-run and juju-dumplogs symlinks
// pointing at this agent's jujud binary in the tools directory.
func (a *MachineAgent) createJujudSymlinks(dataDir string) error {
	jujud := filepath.Join(tools.ToolsDir(dataDir, a.Tag().String()), jujunames.Jujud)
	for _, link := range []string{jujuRun, jujuDumpLogs} {
		err := a.createSymlink(jujud, link)
		if err != nil {
			return errors.Annotatef(err, "failed to create %s symlink", link)
		}
	}
	return nil
}

// createSymlink creates (or replaces) a symlink at link, rooted under
// a.rootDir, pointing at target. An existing link that already points
// at the right place is left untouched.
func (a *MachineAgent) createSymlink(target, link string) error {
	fullLink := utils.EnsureBaseDir(a.rootDir, link)

	currentTarget, err := symlink.Read(fullLink)
	if err != nil && !os.IsNotExist(err) {
		return err
	} else if err == nil {
		// Link already in place - check it.
		if currentTarget == target {
			// Link already points to the right place - nothing to do.
			return nil
		}
		// Link points to the wrong place - delete it.
		if err := os.Remove(fullLink); err != nil {
			return err
		}
	}

	if err := os.MkdirAll(filepath.Dir(fullLink), os.FileMode(0755)); err != nil {
		return err
	}
	return symlink.New(target, fullLink)
}

// removeJujudSymlinks removes the juju-run and juju-dumplogs symlinks,
// collecting (rather than aborting on) any errors encountered. A
// missing link is not an error.
func (a *MachineAgent) removeJujudSymlinks() (errs []error) {
	for _, link := range []string{jujuRun, jujuDumpLogs} {
		err := os.Remove(utils.EnsureBaseDir(a.rootDir, link))
		if err != nil && !os.IsNotExist(err) {
			errs = append(errs, errors.Annotatef(err, "failed to remove %s symlink", link))
		}
	}
	return
}

// uninstallAgent removes the agent's service, symlinks, mongo service
// and data directory. It is deliberately best-effort: errors are
// collected and reported together at the end rather than aborting.
func (a *MachineAgent) uninstallAgent() error {
	// We should only uninstall if the uninstall file is present.
	if !agent.CanUninstall(a) {
		logger.Infof("ignoring uninstall request")
		return nil
	}
	logger.Infof("uninstalling agent")

	agentConfig := a.CurrentConfig()
	var errs []error
	agentServiceName := agentConfig.Value(agent.AgentServiceName)
	if agentServiceName == "" {
		// For backwards compatibility, handle lack of AgentServiceName.
		agentServiceName = os.Getenv("UPSTART_JOB")
	}

	if agentServiceName != "" {
		svc, err := service.DiscoverService(agentServiceName, common.Conf{})
		if err != nil {
			errs = append(errs, errors.Errorf("cannot remove service %q: %v", agentServiceName, err))
		} else if err := svc.Remove(); err != nil {
			errs = append(errs, errors.Errorf("cannot remove service %q: %v", agentServiceName, err))
		}
	}

	errs = append(errs, a.removeJujudSymlinks()...)

	// TODO(fwereade): surely this shouldn't be happening here? Once we're
	// at this point we should expect to be killed in short order; if this
	// work is remotely important we should be blocking machine death on
	// its completion.
	insideContainer := container.RunningInContainer()
	if insideContainer {
		// We're running inside a container, so loop devices may leak. Detach
		// any loop devices that are backed by files on this machine.
		if err := a.loopDeviceManager.DetachLoopDevices("/", agentConfig.DataDir()); err != nil {
			errs = append(errs, err)
		}
	}

	if err := mongo.RemoveService(); err != nil {
		errs = append(errs, errors.Annotate(err, "cannot stop/remove mongo service"))
	}
	if err := os.RemoveAll(agentConfig.DataDir()); err != nil {
		errs = append(errs, err)
	}
	if len(errs) == 0 {
		return nil
	}
	return errors.Errorf("uninstall failed: %v", errs)
}

// newConnRunner returns a worker.Runner that treats a failed ping on
// any of the given connections as fatal.
func newConnRunner(conns ...cmdutil.Pinger) worker.Runner {
	return worker.NewRunner(cmdutil.ConnectionIsFatal(logger, conns...), cmdutil.MoreImportant, worker.RestartDelay)
}

// MongoSessioner exposes the underlying mongo session of a state
// connection.
type MongoSessioner interface {
	MongoSession() *mgo.Session
}

// newSingularStateRunner wraps runner in a singular runner whose
// workers only run while the given machine is the mongo master (see
// singularStateConn.IsMaster).
func newSingularStateRunner(runner worker.Runner, st MongoSessioner, m *state.Machine) (worker.Runner, error) {
	singularStateConn := singularStateConn{st.MongoSession(), m}
	singularRunner, err := newSingularRunner(runner, singularStateConn)
	if err != nil {
		return nil, errors.Annotate(err, "cannot make singular State Runner")
	}
	return singularRunner, err
}

// singularStateConn implements singular.Conn on
// top of a State connection.
1517 type singularStateConn struct { 1518 session *mgo.Session 1519 machine *state.Machine 1520 } 1521 1522 func (c singularStateConn) IsMaster() (bool, error) { 1523 return mongo.IsMaster(c.session, c.machine) 1524 } 1525 1526 func (c singularStateConn) Ping() error { 1527 return c.session.Ping() 1528 } 1529 1530 func metricAPI(st api.Connection) (metricsmanager.MetricsManagerClient, error) { 1531 client, err := metricsmanager.NewClient(st) 1532 if err != nil { 1533 return nil, errors.Trace(err) 1534 } 1535 return client, nil 1536 } 1537 1538 // newDeployContext gives the tests the opportunity to create a deployer.Context 1539 // that can be used for testing so as to avoid (1) deploying units to the system 1540 // running the tests and (2) get access to the *State used internally, so that 1541 // tests can be run without waiting for the 5s watcher refresh time to which we would 1542 // otherwise be restricted. 1543 var newDeployContext = func(st *apideployer.State, agentConfig agent.Config) deployer.Context { 1544 return deployer.NewSimpleContext(agentConfig, st) 1545 }