// Copyright 2012, 2013 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package agent

import (
	"fmt"
	"io"
	"net"
	"os"
	"path/filepath"
	"runtime"
	"strconv"
	"sync"
	"time"

	"github.com/juju/cmd"
	"github.com/juju/errors"
	"github.com/juju/loggo"
	"github.com/juju/names"
	"github.com/juju/replicaset"
	"github.com/juju/utils"
	"github.com/juju/utils/featureflag"
	"github.com/juju/utils/set"
	"github.com/juju/utils/symlink"
	"github.com/juju/utils/voyeur"
	"gopkg.in/juju/charm.v5/charmrepo"
	"gopkg.in/mgo.v2"
	"gopkg.in/natefinch/lumberjack.v2"
	"launchpad.net/gnuflag"
	"launchpad.net/tomb"

	"github.com/juju/juju/agent"
	"github.com/juju/juju/api"
	apiagent "github.com/juju/juju/api/agent"
	apideployer "github.com/juju/juju/api/deployer"
	"github.com/juju/juju/api/metricsmanager"
	apiupgrader "github.com/juju/juju/api/upgrader"
	"github.com/juju/juju/apiserver"
	"github.com/juju/juju/apiserver/params"
	"github.com/juju/juju/cert"
	"github.com/juju/juju/cmd/jujud/reboot"
	cmdutil "github.com/juju/juju/cmd/jujud/util"
	"github.com/juju/juju/container"
	"github.com/juju/juju/container/kvm"
	"github.com/juju/juju/container/lxc"
	"github.com/juju/juju/environs"
	"github.com/juju/juju/environs/config"
	"github.com/juju/juju/feature"
	"github.com/juju/juju/instance"
	jujunames "github.com/juju/juju/juju/names"
	"github.com/juju/juju/juju/paths"
	"github.com/juju/juju/lease"
	"github.com/juju/juju/mongo"
	"github.com/juju/juju/network"
	"github.com/juju/juju/provider"
	"github.com/juju/juju/service"
	"github.com/juju/juju/service/common"
	"github.com/juju/juju/state"
	"github.com/juju/juju/state/multiwatcher"
	statestorage "github.com/juju/juju/state/storage"
	coretools "github.com/juju/juju/tools"
	"github.com/juju/juju/version"
	"github.com/juju/juju/worker"
	"github.com/juju/juju/worker/addresser"
	"github.com/juju/juju/worker/apiaddressupdater"
	"github.com/juju/juju/worker/authenticationworker"
	"github.com/juju/juju/worker/certupdater"
	"github.com/juju/juju/worker/charmrevisionworker"
	"github.com/juju/juju/worker/cleaner"
	"github.com/juju/juju/worker/conv2state"
	"github.com/juju/juju/worker/dblogpruner"
	"github.com/juju/juju/worker/deployer"
	"github.com/juju/juju/worker/diskmanager"
	"github.com/juju/juju/worker/envworkermanager"
	"github.com/juju/juju/worker/firewaller"
	"github.com/juju/juju/worker/instancepoller"
	"github.com/juju/juju/worker/localstorage"
	workerlogger "github.com/juju/juju/worker/logger"
	"github.com/juju/juju/worker/machiner"
	"github.com/juju/juju/worker/metricworker"
	"github.com/juju/juju/worker/minunitsworker"
	"github.com/juju/juju/worker/networker"
	"github.com/juju/juju/worker/peergrouper"
	"github.com/juju/juju/worker/provisioner"
	"github.com/juju/juju/worker/proxyupdater"
	rebootworker "github.com/juju/juju/worker/reboot"
	"github.com/juju/juju/worker/resumer"
	"github.com/juju/juju/worker/rsyslog"
	"github.com/juju/juju/worker/singular"
	"github.com/juju/juju/worker/statushistorypruner"
	"github.com/juju/juju/worker/storageprovisioner"
	"github.com/juju/juju/worker/terminationworker"
	"github.com/juju/juju/worker/txnpruner"
	"github.com/juju/juju/worker/upgrader"
)

const bootstrapMachineId = "0"

var (
	logger     = loggo.GetLogger("juju.cmd.jujud")
	retryDelay = 3 * time.Second
	JujuRun    = paths.MustSucceed(paths.JujuRun(version.Current.Series))

	// The following are defined as variables to allow the tests to
	// intercept calls to the functions.
	useMultipleCPUs          = utils.UseMultipleCPUs
	maybeInitiateMongoServer = peergrouper.MaybeInitiateMongoServer
	ensureMongoAdminUser     = mongo.EnsureAdminUser
	newSingularRunner        = singular.New
	peergrouperNew           = peergrouper.New
	newNetworker             = networker.NewNetworker
	newFirewaller            = firewaller.NewFirewaller
	newDiskManager           = diskmanager.NewWorker
	newStorageWorker         = storageprovisioner.NewStorageProvisioner
	newCertificateUpdater    = certupdater.NewCertificateUpdater
	newResumer               = resumer.NewResumer
	newInstancePoller        = instancepoller.NewWorker
	newCleaner               = cleaner.NewCleaner
	reportOpenedState        = func(io.Closer) {}
	reportOpenedAPI          = func(io.Closer) {}
	reportClosedMachineAPI   = func(io.Closer) {}
	getMetricAPI             = metricAPI
)

// ProductionMongoWriteConcern defaults to true; it is a variable so that
// tests can override it.
var ProductionMongoWriteConcern = true

func init() {
	stateWorkerDialOpts = mongo.DefaultDialOpts()
	stateWorkerDialOpts.PostDial = func(session *mgo.Session) error {
		safe := mgo.Safe{}
		if ProductionMongoWriteConcern {
			safe.J = true
			_, err := replicaset.CurrentConfig(session)
			if err == nil {
				// Set mongo to write-majority (writes only returned after
				// being replicated to a majority of replica-set members).
				safe.WMode = "majority"
			}
		}
		session.SetSafe(&safe)
		return nil
	}
}
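
// In effect (an illustrative restatement of the hook above, not additional
// behavior): on a replica set the dial hook asks for journaled,
// majority-acknowledged writes, while a lone mongod only gets journaling:
//
//	session.SetSafe(&mgo.Safe{J: true, WMode: "majority"}) // replica set
//	session.SetSafe(&mgo.Safe{J: true})                    // single node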

// AgentInitializer handles initializing a type for use as a Jujud
// agent.
type AgentInitializer interface {
	AddFlags(*gnuflag.FlagSet)
	CheckArgs([]string) error
}

// AgentConfigWriter encapsulates disk I/O operations with the agent
// config.
type AgentConfigWriter interface {
	// ReadConfig reads the config for the given tag from disk.
	ReadConfig(tag string) error
	// ChangeConfig executes the given agent.ConfigMutator in a
	// thread-safe context.
	ChangeConfig(agent.ConfigMutator) error
	// CurrentConfig returns a copy of the in-memory agent config.
	CurrentConfig() agent.Config
}

// NewMachineAgentCmd creates a Command which handles parsing
// command-line arguments and instantiating and running a
// MachineAgent.
func NewMachineAgentCmd(
	ctx *cmd.Context,
	machineAgentFactory func(string) *MachineAgent,
	agentInitializer AgentInitializer,
	configFetcher AgentConfigWriter,
) cmd.Command {
	return &machineAgentCmd{
		ctx:                 ctx,
		machineAgentFactory: machineAgentFactory,
		agentInitializer:    agentInitializer,
		currentConfig:       configFetcher,
	}
}

type machineAgentCmd struct {
	cmd.CommandBase

	// This group of arguments is required.
	agentInitializer    AgentInitializer
	currentConfig       AgentConfigWriter
	machineAgentFactory func(string) *MachineAgent
	ctx                 *cmd.Context

	// This group is for debugging purposes.
	logToStdErr bool

	// The following are set via command-line flags.
	machineId string
}

// Init is called by the cmd system to initialize the structure for
// running.
func (a *machineAgentCmd) Init(args []string) error {
	if !names.IsValidMachine(a.machineId) {
		return fmt.Errorf("--machine-id option must be set, and expects a non-negative integer")
	}
	if err := a.agentInitializer.CheckArgs(args); err != nil {
		return err
	}

	// Due to changes in the logging, and needing to care about old
	// environments that have been upgraded, we need to explicitly remove the
	// file writer if one has been added, otherwise we will get duplicate
	// lines of all logging in the log file.
	loggo.RemoveWriter("logfile")

	if a.logToStdErr {
		return nil
	}

	err := a.currentConfig.ReadConfig(names.NewMachineTag(a.machineId).String())
	if err != nil {
		return errors.Annotate(err, "cannot read agent configuration")
	}
	agentConfig := a.currentConfig.CurrentConfig()

	// The context's stderr is set as the loggo writer in github.com/juju/cmd/logging.go.
	a.ctx.Stderr = &lumberjack.Logger{
		Filename:   agent.LogFilename(agentConfig),
		MaxSize:    300, // megabytes
		MaxBackups: 2,
	}

	return nil
}

// Run instantiates a MachineAgent and runs it.
func (a *machineAgentCmd) Run(c *cmd.Context) error {
	machineAgent := a.machineAgentFactory(a.machineId)
	return machineAgent.Run(c)
}

// SetFlags adds the requisite flags to run this command.
func (a *machineAgentCmd) SetFlags(f *gnuflag.FlagSet) {
	a.agentInitializer.AddFlags(f)
	f.StringVar(&a.machineId, "machine-id", "", "id of the machine to run")
}

// Info returns usage information for the command.
func (a *machineAgentCmd) Info() *cmd.Info {
	return &cmd.Info{
		Name:    "machine",
		Purpose: "run a juju machine agent",
	}
}

// MachineAgentFactoryFn returns a function which instantiates a
// MachineAgent given a machineId.
func MachineAgentFactoryFn(
	agentConfWriter AgentConfigWriter,
	apiAddressSetter apiaddressupdater.APIAddressSetter,
) func(string) *MachineAgent {
	return func(machineId string) *MachineAgent {
		return NewMachineAgent(
			machineId,
			agentConfWriter,
			apiAddressSetter,
			NewUpgradeWorkerContext(),
			worker.NewRunner(cmdutil.IsFatal, cmdutil.MoreImportant),
		)
	}
}

// NewMachineAgent instantiates a new MachineAgent.
func NewMachineAgent(
	machineId string,
	agentConfWriter AgentConfigWriter,
	apiAddressSetter apiaddressupdater.APIAddressSetter,
	upgradeWorkerContext *upgradeWorkerContext,
	runner worker.Runner,
) *MachineAgent {
	return &MachineAgent{
		machineId:                        machineId,
		AgentConfigWriter:                agentConfWriter,
		apiAddressSetter:                 apiAddressSetter,
		workersStarted:                   make(chan struct{}),
		upgradeWorkerContext:             upgradeWorkerContext,
		runner:                           runner,
		initialAgentUpgradeCheckComplete: make(chan struct{}),
	}
}

// APIStateUpgrader defines the methods on the Upgrader that
// agents call.
type APIStateUpgrader interface {
	SetVersion(string, version.Binary) error
}
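
// Test seam illustration (a hedged sketch, not part of the original file):
// because apiStateUpgrader below is an interface field, tests can slot in a
// no-op implementation instead of a live API connection:
//
//	type fakeUpgrader struct{}
//
//	func (fakeUpgrader) SetVersion(tag string, v version.Binary) error { return nil }
//
//	// a := NewMachineAgent(...); a.apiStateUpgrader = fakeUpgrader{}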

// MachineAgent is responsible for tying together all functionality
// needed to orchestrate a Jujud instance which controls a machine.
type MachineAgent struct {
	AgentConfigWriter

	tomb                 tomb.Tomb
	machineId            string
	previousAgentVersion version.Number
	apiAddressSetter     apiaddressupdater.APIAddressSetter
	runner               worker.Runner
	configChangedVal     voyeur.Value
	upgradeWorkerContext *upgradeWorkerContext
	restoreMode          bool
	restoring            bool
	workersStarted       chan struct{}

	// Used to signal that the upgrade worker will not
	// reboot the agent on startup because there are no
	// longer any immediately pending agent upgrades.
	// Channel used as a selectable bool (closed means true).
	initialAgentUpgradeCheckComplete chan struct{}

	mongoInitMutex   sync.Mutex
	mongoInitialized bool

	apiStateUpgrader APIStateUpgrader
}

func (a *MachineAgent) getUpgrader(st *api.State) APIStateUpgrader {
	if a.apiStateUpgrader != nil {
		return a.apiStateUpgrader
	}
	return st.Upgrader()
}

// IsRestorePreparing reports whether we are in restore mode
// but not yet running the restore itself.
func (a *MachineAgent) IsRestorePreparing() bool {
	return a.restoreMode && !a.restoring
}

// IsRestoreRunning reports whether we are in restore mode
// and running the actual restore process.
func (a *MachineAgent) IsRestoreRunning() bool {
	return a.restoring
}

func (a *MachineAgent) isAgentUpgradePending() bool {
	select {
	case <-a.initialAgentUpgradeCheckComplete:
		return false
	default:
		return true
	}
}

// Wait waits for the machine agent to finish.
func (a *MachineAgent) Wait() error {
	return a.tomb.Wait()
}

// Stop stops the machine agent.
func (a *MachineAgent) Stop() error {
	a.runner.Kill()
	return a.tomb.Wait()
}

// Dying returns the channel that can be used to see if the machine
// agent is terminating.
func (a *MachineAgent) Dying() <-chan struct{} {
	return a.tomb.Dying()
}

// upgradeCertificateDNSNames ensures that the state server certificate
// recorded in the agent config, and also the mongo server.pem, contains the
// DNSNames entries required by Juju.
func (a *MachineAgent) upgradeCertificateDNSNames() error {
	agentConfig := a.CurrentConfig()
	si, ok := agentConfig.StateServingInfo()
	if !ok || si.CAPrivateKey == "" {
		// No certificate information exists yet, nothing to do.
		return nil
	}
	// Parse the current certificate to get the current DNS names.
	serverCert, err := cert.ParseCert(si.Cert)
	if err != nil {
		return err
	}
	update := false
	dnsNames := set.NewStrings(serverCert.DNSNames...)
	requiredDNSNames := []string{"local", "juju-apiserver", "juju-mongodb"}
	for _, dnsName := range requiredDNSNames {
		if dnsNames.Contains(dnsName) {
			continue
		}
		dnsNames.Add(dnsName)
		update = true
	}
	if !update {
		return nil
	}
	// Write a new certificate to the mongo pem and agent config files.
	si.Cert, si.PrivateKey, err = cert.NewDefaultServer(agentConfig.CACert(), si.CAPrivateKey, dnsNames.Values())
	if err != nil {
		return err
	}
	if err := mongo.UpdateSSLKey(agentConfig.DataDir(), si.Cert, si.PrivateKey); err != nil {
		return err
	}
	return a.AgentConfigWriter.ChangeConfig(func(config agent.ConfigSetter) error {
		config.SetStateServingInfo(si)
		return nil
	})
}
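
// Illustrative check (a hedged sketch using only the standard library's
// encoding/pem and crypto/x509; si is the local from the function above and
// the surrounding test scaffolding is hypothetical): after
// upgradeCertificateDNSNames runs, the stored certificate should parse with
// the required SANs present:
//
//	block, _ := pem.Decode([]byte(si.Cert))
//	parsed, err := x509.ParseCertificate(block.Bytes)
//	// expect parsed.DNSNames to include "local", "juju-apiserver"
//	// and "juju-mongodb"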

// Run runs a machine agent.
func (a *MachineAgent) Run(*cmd.Context) error {
	defer a.tomb.Done()
	if err := a.ReadConfig(a.Tag().String()); err != nil {
		return fmt.Errorf("cannot read agent configuration: %v", err)
	}

	logger.Infof("machine agent %v start (%s [%s])", a.Tag(), version.Current, runtime.Compiler)
	if flags := featureflag.String(); flags != "" {
		logger.Warningf("developer feature flags enabled: %s", flags)
	}

	// Before doing anything else, we need to make sure the certificate
	// generated for use by mongo to validate state server connections is
	// correct. This needs to be done before any possible restart of the
	// mongo service. See bug http://pad.lv/1434680
	if err := a.upgradeCertificateDNSNames(); err != nil {
		return errors.Annotate(err, "error upgrading server certificate")
	}
	agentConfig := a.CurrentConfig()

	if err := a.upgradeWorkerContext.InitializeUsingAgent(a); err != nil {
		return errors.Annotate(err, "error during upgradeWorkerContext initialisation")
	}
	a.configChangedVal.Set(struct{}{})
	a.previousAgentVersion = agentConfig.UpgradedToVersion()
	network.InitializeFromConfig(agentConfig)
	charmrepo.CacheDir = filepath.Join(agentConfig.DataDir(), "charmcache")
	if err := a.createJujuRun(agentConfig.DataDir()); err != nil {
		return fmt.Errorf("cannot create juju run symlink: %v", err)
	}
	a.runner.StartWorker("api", a.APIWorker)
	a.runner.StartWorker("statestarter", a.newStateStarterWorker)
	a.runner.StartWorker("termination", func() (worker.Worker, error) {
		return terminationworker.NewWorker(), nil
	})
	// At this point, all workers will have been configured to start.
	close(a.workersStarted)
	err := a.runner.Wait()
	switch err {
	case worker.ErrTerminateAgent:
		err = a.uninstallAgent(agentConfig)
	case worker.ErrRebootMachine:
		logger.Infof("Caught reboot error")
		err = a.executeRebootOrShutdown(params.ShouldReboot)
	case worker.ErrShutdownMachine:
		logger.Infof("Caught shutdown error")
		err = a.executeRebootOrShutdown(params.ShouldShutdown)
	}
	err = cmdutil.AgentDone(logger, err)
	a.tomb.Kill(err)
	return err
}
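
// Lifecycle sketch (hedged; errc and ctx are hypothetical caller-side
// names): Run blocks until the top-level runner finishes, so callers
// typically drive the agent from a goroutine and shut it down via Stop:
//
//	errc := make(chan error, 1)
//	go func() { errc <- machineAgent.Run(ctx) }()
//	// ... later ...
//	machineAgent.Stop() // kills the runner; Run returns and errc receives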

func (a *MachineAgent) executeRebootOrShutdown(action params.RebootAction) error {
	agentCfg := a.CurrentConfig()
	// At this stage, all API connections would have been closed.
	// We need to reopen the API to clear the reboot flag after
	// scheduling the reboot. It may be cleaner to do this in the reboot
	// worker, before returning the ErrRebootMachine.
	st, _, err := OpenAPIState(agentCfg, a)
	if err != nil {
		logger.Infof("Reboot: Error connecting to state")
		return errors.Trace(err)
	}
	// Block until all units/containers are ready, and reboot/shutdown.
	finalize, err := reboot.NewRebootWaiter(st, agentCfg)
	if err != nil {
		return errors.Trace(err)
	}

	logger.Infof("Reboot: Executing reboot")
	err = finalize.ExecuteReboot(action)
	if err != nil {
		logger.Infof("Reboot: Error executing reboot: %v", err)
		return errors.Trace(err)
	}
	// On Windows, the shutdown command is asynchronous. We return ErrRebootMachine
	// so the agent will simply exit without error pending reboot/shutdown.
	return worker.ErrRebootMachine
}

func (a *MachineAgent) ChangeConfig(mutate agent.ConfigMutator) error {
	err := a.AgentConfigWriter.ChangeConfig(mutate)
	a.configChangedVal.Set(struct{}{})
	if err != nil {
		return errors.Trace(err)
	}
	return nil
}

// PrepareRestore flags the agent to allow only a limited set of commands,
// defined in "github.com/juju/juju/apiserver".allowedMethodsAboutToRestore.
// The most noteworthy of these is Backups.Restore, which ensures that we can
// do all the file movements required for restore and that no one else makes
// changes while we do so. It returns an error if the machine is already in
// this state.
func (a *MachineAgent) PrepareRestore() error {
	if a.restoreMode {
		return errors.Errorf("already in restore mode")
	}
	a.restoreMode = true
	return nil
}

// BeginRestore flags the agent to disallow all commands, since restore
// should be running and therefore making changes that would override
// anything done while it runs.
func (a *MachineAgent) BeginRestore() error {
	switch {
	case !a.restoreMode:
		return errors.Errorf("not in restore mode, cannot begin restoration")
	case a.restoring:
		return errors.Errorf("already restoring")
	}
	a.restoring = true
	return nil
}

// EndRestore flags the agent to allow all commands again. Being invoked
// means that the restore process failed, since success restarts the agent.
func (a *MachineAgent) EndRestore() {
	a.restoreMode = false
	a.restoring = false
}

// newRestoreStateWatcherWorker returns a worker, or an error if there is a
// failure. The worker takes care of watching the state of the restoreInfo
// doc and putting the agent into the different restore modes.
func (a *MachineAgent) newRestoreStateWatcherWorker(st *state.State) (worker.Worker, error) {
	rWorker := func(stopch <-chan struct{}) error {
		return a.restoreStateWatcher(st, stopch)
	}
	return worker.NewSimpleWorker(rWorker), nil
}

// restoreChanged is called whenever the restoreInfo doc changes, signalling
// a new step in the restore process.
func (a *MachineAgent) restoreChanged(st *state.State) error {
	rinfo, err := st.RestoreInfoSetter()
	if err != nil {
		return errors.Annotate(err, "cannot read restore state")
	}
	switch rinfo.Status() {
	case state.RestorePending:
		a.PrepareRestore()
	case state.RestoreInProgress:
		a.BeginRestore()
	case state.RestoreFailed:
		a.EndRestore()
	}
	return nil
}

// restoreStateWatcher watches restoreInfo looking for changes in the restore process.
func (a *MachineAgent) restoreStateWatcher(st *state.State, stopch <-chan struct{}) error {
	restoreWatch := st.WatchRestoreInfoChanges()
	defer func() {
		restoreWatch.Kill()
		restoreWatch.Wait()
	}()

	for {
		select {
		case <-restoreWatch.Changes():
			if err := a.restoreChanged(st); err != nil {
				return err
			}
		case <-stopch:
			return nil
		}
	}
}

// newStateStarterWorker wraps stateStarter in a simple worker for use in
// a.runner.StartWorker.
func (a *MachineAgent) newStateStarterWorker() (worker.Worker, error) {
	return worker.NewSimpleWorker(a.stateStarter), nil
}
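
// Pattern note (an illustrative sketch of the voyeur usage below, with
// hypothetical local names): every Set on a voyeur.Value wakes its watchers,
// which is how ChangeConfig above nudges stateStarter:
//
//	var v voyeur.Value
//	w := v.Watch()
//	go func() {
//		for w.Next() { /* react to a config change */ }
//	}()
//	v.Set(struct{}{}) // causes w.Next() to return true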

// stateStarter watches for changes to the agent configuration, and
// starts or stops the state worker as appropriate. We watch the agent
// configuration because the agent configuration has all the details
// that we need to start a state server, whether they have been cached
// or read from the state.
//
// It will stop working as soon as stopch is closed.
func (a *MachineAgent) stateStarter(stopch <-chan struct{}) error {
	confWatch := a.configChangedVal.Watch()
	defer confWatch.Close()
	watchCh := make(chan struct{})
	go func() {
		for confWatch.Next() {
			watchCh <- struct{}{}
		}
	}()
	for {
		select {
		case <-watchCh:
			agentConfig := a.CurrentConfig()

			// N.B. StartWorker and StopWorker are idempotent.
			_, ok := agentConfig.StateServingInfo()
			if ok {
				a.runner.StartWorker("state", func() (worker.Worker, error) {
					return a.StateWorker()
				})
			} else {
				a.runner.StopWorker("state")
			}
		case <-stopch:
			return nil
		}
	}
}

// APIWorker returns a Worker that connects to the API and starts any
// workers that need an API connection.
func (a *MachineAgent) APIWorker() (_ worker.Worker, err error) {
	agentConfig := a.CurrentConfig()
	st, entity, err := OpenAPIState(agentConfig, a)
	if err != nil {
		return nil, err
	}
	reportOpenedAPI(st)

	defer func() {
		if err != nil {
			st.Close()
			reportClosedMachineAPI(st)
		}
	}()

	// Refresh the configuration, since it may have been updated after opening state.
	agentConfig = a.CurrentConfig()
	for _, job := range entity.Jobs() {
		if job.NeedsState() {
			info, err := st.Agent().StateServingInfo()
			if err != nil {
				return nil, fmt.Errorf("cannot get state serving info: %v", err)
			}
			err = a.ChangeConfig(func(config agent.ConfigSetter) error {
				config.SetStateServingInfo(info)
				return nil
			})
			if err != nil {
				return nil, err
			}
			agentConfig = a.CurrentConfig()
			break
		}
	}

	// Before starting any workers, ensure we record the Juju version this machine
	// agent is running.
	currentTools := &coretools.Tools{Version: version.Current}
	apiStateUpgrader := a.getUpgrader(st)
	if err := apiStateUpgrader.SetVersion(agentConfig.Tag().String(), currentTools.Version); err != nil {
		return nil, errors.Annotate(err, "cannot set machine agent version")
	}

	runner := newConnRunner(st)

	// Run the agent upgrader and the upgrade-steps worker without waiting for
	// the upgrade steps to complete.
	runner.StartWorker("upgrader", a.agentUpgraderWorkerStarter(st.Upgrader(), agentConfig))
	runner.StartWorker("upgrade-steps", a.upgradeStepsWorkerStarter(st, entity.Jobs()))

	// All other workers must wait for the upgrade steps to complete before starting.
	a.startWorkerAfterUpgrade(runner, "api-post-upgrade", func() (worker.Worker, error) {
		return a.postUpgradeAPIWorker(st, agentConfig, entity)
	})
	return cmdutil.NewCloseWorker(logger, runner, st), nil // Note: a worker.Runner is itself a worker.Worker.
}
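
// Usage sketch (hedged; the worker name and body are hypothetical): any
// worker that must not run until upgrades finish is registered through
// startWorkerAfterUpgrade instead of runner.StartWorker directly:
//
//	a.startWorkerAfterUpgrade(runner, "my-worker", func() (worker.Worker, error) {
//		return worker.NewSimpleWorker(func(stop <-chan struct{}) error {
//			<-stop // a real worker would do useful work here
//			return nil
//		}), nil
//	})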

func (a *MachineAgent) postUpgradeAPIWorker(
	st *api.State,
	agentConfig agent.Config,
	entity *apiagent.Entity,
) (worker.Worker, error) {
	var isEnvironManager bool
	for _, job := range entity.Jobs() {
		if job == multiwatcher.JobManageEnviron {
			isEnvironManager = true
			break
		}
	}

	rsyslogMode := rsyslog.RsyslogModeForwarding
	if isEnvironManager {
		rsyslogMode = rsyslog.RsyslogModeAccumulate
	}

	runner := newConnRunner(st)
	// TODO(fwereade): this is *still* a hideous layering violation, but at least
	// it's confined to jujud rather than extending into the worker itself.
	// Start this worker first to try and get proxy settings in place
	// before we do anything else.
	writeSystemFiles := shouldWriteProxyFiles(agentConfig)
	runner.StartWorker("proxyupdater", func() (worker.Worker, error) {
		return proxyupdater.New(st.Environment(), writeSystemFiles), nil
	})

	if isEnvironManager {
		runner.StartWorker("resumer", func() (worker.Worker, error) {
			// The action of resumer is so subtle that it is not tested,
			// because we can't figure out how to do so without
			// brutalising the transaction log.
			return newResumer(st.Resumer()), nil
		})
	}

	runner.StartWorker("machiner", func() (worker.Worker, error) {
		accessor := machiner.APIMachineAccessor{st.Machiner()}
		return machiner.NewMachiner(accessor, agentConfig), nil
	})
	runner.StartWorker("reboot", func() (worker.Worker, error) {
		reboot, err := st.Reboot()
		if err != nil {
			return nil, errors.Trace(err)
		}
		lock, err := cmdutil.HookExecutionLock(cmdutil.DataDir)
		if err != nil {
			return nil, errors.Trace(err)
		}
		return rebootworker.NewReboot(reboot, agentConfig, lock)
	})
	runner.StartWorker("apiaddressupdater", func() (worker.Worker, error) {
		return apiaddressupdater.NewAPIAddressUpdater(st.Machiner(), a.apiAddressSetter), nil
	})
	runner.StartWorker("logger", func() (worker.Worker, error) {
		return workerlogger.NewLogger(st.Logger(), agentConfig), nil
	})

	runner.StartWorker("rsyslog", func() (worker.Worker, error) {
		return cmdutil.NewRsyslogConfigWorker(st.Rsyslog(), agentConfig, rsyslogMode)
	})

	if !isEnvironManager {
		runner.StartWorker("stateconverter", func() (worker.Worker, error) {
			return worker.NewNotifyWorker(conv2state.New(st.Machiner(), a)), nil
		})
	}

	runner.StartWorker("diskmanager", func() (worker.Worker, error) {
		api, err := st.DiskManager()
		if err != nil {
			return nil, errors.Trace(err)
		}
		return newDiskManager(diskmanager.DefaultListBlockDevices, api), nil
	})
	runner.StartWorker("storageprovisioner-machine", func() (worker.Worker, error) {
		scope := agentConfig.Tag()
		api := st.StorageProvisioner(scope)
		storageDir := filepath.Join(agentConfig.DataDir(), "storage")
		return newStorageWorker(scope, storageDir, api, api, api, api, api), nil
	})

	// Check if network management is disabled.
	envConfig, err := st.Environment().EnvironConfig()
	if err != nil {
		return nil, fmt.Errorf("cannot read environment config: %v", err)
	}
	disableNetworkManagement, _ := envConfig.DisableNetworkManagement()
	if disableNetworkManagement {
		logger.Infof("network management is disabled")
	}

	// Start the networker depending on configuration and job.
	intrusiveMode := false
	for _, job := range entity.Jobs() {
		if job == multiwatcher.JobManageNetworking {
			intrusiveMode = true
			break
		}
	}
	intrusiveMode = intrusiveMode && !disableNetworkManagement
	runner.StartWorker("networker", func() (worker.Worker, error) {
		return newNetworker(st.Networker(), agentConfig, intrusiveMode, networker.DefaultConfigBaseDir)
	})

	// If not a local provider bootstrap machine, start the worker to
	// manage SSH keys.
	providerType := agentConfig.Value(agent.ProviderType)
	if providerType != provider.Local || a.machineId != bootstrapMachineId {
		runner.StartWorker("authenticationworker", func() (worker.Worker, error) {
			return authenticationworker.NewWorker(st.KeyUpdater(), agentConfig), nil
		})
	}

	// Perform the operations needed to set up hosting for containers.
	if err := a.setupContainerSupport(runner, st, entity, agentConfig); err != nil {
		cause := errors.Cause(err)
		if params.IsCodeDead(cause) || cause == worker.ErrTerminateAgent {
			return nil, worker.ErrTerminateAgent
		}
		return nil, fmt.Errorf("setting up container support: %v", err)
	}
	for _, job := range entity.Jobs() {
		switch job {
		case multiwatcher.JobHostUnits:
			runner.StartWorker("deployer", func() (worker.Worker, error) {
				apiDeployer := st.Deployer()
				context := newDeployContext(apiDeployer, agentConfig)
				return deployer.NewDeployer(apiDeployer, context), nil
			})
		case multiwatcher.JobManageEnviron:
			runner.StartWorker("identity-file-writer", func() (worker.Worker, error) {
				inner := func(<-chan struct{}) error {
					agentConfig := a.CurrentConfig()
					return agent.WriteSystemIdentityFile(agentConfig)
				}
				return worker.NewSimpleWorker(inner), nil
			})
		case multiwatcher.JobManageStateDeprecated:
			// Legacy environments may set this, but we ignore it.
		default:
			// TODO(dimitern): Once all workers have moved over to using
			// the API, report "unknown job type" here.
		}
	}

	return cmdutil.NewCloseWorker(logger, runner, st), nil // Note: a worker.Runner is itself a worker.Worker.
}

// Restart restarts the agent's service.
func (a *MachineAgent) Restart() error {
	name := a.CurrentConfig().Value(agent.AgentServiceName)
	return service.Restart(name)
}

func (a *MachineAgent) upgradeStepsWorkerStarter(
	st *api.State,
	jobs []multiwatcher.MachineJob,
) func() (worker.Worker, error) {
	return func() (worker.Worker, error) {
		return a.upgradeWorkerContext.Worker(a, st, jobs), nil
	}
}

func (a *MachineAgent) agentUpgraderWorkerStarter(
	st *apiupgrader.State,
	agentConfig agent.Config,
) func() (worker.Worker, error) {
	return func() (worker.Worker, error) {
		return upgrader.NewAgentUpgrader(
			st,
			agentConfig,
			a.previousAgentVersion,
			a.upgradeWorkerContext.IsUpgradeRunning,
			a.initialAgentUpgradeCheckComplete,
		), nil
	}
}
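
// Test seam sketch (hedged; restore is a hypothetical local name): because
// shouldWriteProxyFiles below is a package variable, tests can stub it out:
//
//	restore := shouldWriteProxyFiles
//	shouldWriteProxyFiles = func(agent.Config) bool { return false }
//	defer func() { shouldWriteProxyFiles = restore }()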

// shouldWriteProxyFiles returns true, unless the supplied conf identifies the
// machine agent running directly on the host system in a local environment.
var shouldWriteProxyFiles = func(conf agent.Config) bool {
	if conf.Value(agent.ProviderType) != provider.Local {
		return true
	}
	return conf.Tag() != names.NewMachineTag(bootstrapMachineId)
}

// setupContainerSupport determines what containers can be run on this machine and
// initialises suitable infrastructure to support such containers.
func (a *MachineAgent) setupContainerSupport(runner worker.Runner, st *api.State, entity *apiagent.Entity, agentConfig agent.Config) error {
	var supportedContainers []instance.ContainerType
	// LXC containers are only supported on bare metal and fully virtualized linux systems.
	// Nested LXC containers and Windows machines cannot run LXC containers.
	supportsLXC, err := lxc.IsLXCSupported()
	if err != nil {
		logger.Warningf("no lxc containers possible: %v", err)
	}
	if err == nil && supportsLXC {
		supportedContainers = append(supportedContainers, instance.LXC)
	}

	supportsKvm, err := kvm.IsKVMSupported()
	if err != nil {
		logger.Warningf("determining kvm support: %v\nno kvm containers possible", err)
	}
	if err == nil && supportsKvm {
		supportedContainers = append(supportedContainers, instance.KVM)
	}
	return a.updateSupportedContainers(runner, st, entity.Tag(), supportedContainers, agentConfig)
}

// updateSupportedContainers records in state that a machine can run the specified containers.
// It starts a watcher and when a container of a given type is first added to the machine,
// the watcher is killed, the machine is set up to be able to start containers of the given type,
// and a suitable provisioner is started.
func (a *MachineAgent) updateSupportedContainers(
	runner worker.Runner,
	st *api.State,
	machineTag string,
	containers []instance.ContainerType,
	agentConfig agent.Config,
) error {
	pr := st.Provisioner()
	tag, err := names.ParseMachineTag(machineTag)
	if err != nil {
		return err
	}
	machine, err := pr.Machine(tag)
	if errors.IsNotFound(err) || err == nil && machine.Life() == params.Dead {
		return worker.ErrTerminateAgent
	}
	if err != nil {
		return errors.Annotatef(err, "cannot load machine %s from state", tag)
	}
	if len(containers) == 0 {
		if err := machine.SupportsNoContainers(); err != nil {
			return errors.Annotatef(err, "clearing supported containers for %s", tag)
		}
		return nil
	}
	if err := machine.SetSupportedContainers(containers...); err != nil {
		return errors.Annotatef(err, "setting supported containers for %s", tag)
	}
	initLock, err := cmdutil.HookExecutionLock(agentConfig.DataDir())
	if err != nil {
		return err
	}
	// Start the watcher to fire when a container is first requested on the machine.
	envUUID, err := st.EnvironTag()
	if err != nil {
		return err
	}
	watcherName := fmt.Sprintf("%s-container-watcher", machine.Id())
	// There may not be a CA certificate private key available, and without
	// it we can't ensure that other Juju nodes can connect securely, so only
	// use an image URL getter if there's a private key.
	var imageURLGetter container.ImageURLGetter
	if agentConfig.Value(agent.AllowsSecureConnection) == "true" {
		imageURLGetter = container.NewImageURLGetter(st.Addr(), envUUID.Id(), []byte(agentConfig.CACert()))
	}
	params := provisioner.ContainerSetupParams{
		Runner:              runner,
		WorkerName:          watcherName,
		SupportedContainers: containers,
		ImageURLGetter:      imageURLGetter,
		Machine:             machine,
		Provisioner:         pr,
		Config:              agentConfig,
		InitLock:            initLock,
	}
	handler := provisioner.NewContainerSetupHandler(params)
	a.startWorkerAfterUpgrade(runner, watcherName, func() (worker.Worker, error) {
		return worker.NewStringsWorker(handler), nil
	})
	return nil
}

// StateWorker returns a worker running all the workers that require
// a *state.State connection.
func (a *MachineAgent) StateWorker() (worker.Worker, error) {
	agentConfig := a.CurrentConfig()

	// Start MongoDB server and dial.
	if err := a.ensureMongoServer(agentConfig); err != nil {
		return nil, err
	}
	st, m, err := openState(agentConfig, stateWorkerDialOpts)
	if err != nil {
		return nil, err
	}
	reportOpenedState(st)

	stor := statestorage.NewStorage(st.EnvironUUID(), st.MongoSession())
	registerSimplestreamsDataSource(stor)

	runner := newConnRunner(st)
	singularRunner, err := newSingularStateRunner(runner, st, m)
	if err != nil {
		return nil, errors.Trace(err)
	}

	// Take advantage of special knowledge here in that we will only ever want
	// the storage provider on one machine, and that is the "bootstrap" node.
	providerType := agentConfig.Value(agent.ProviderType)
	if (providerType == provider.Local || provider.IsManual(providerType)) && m.Id() == bootstrapMachineId {
		a.startWorkerAfterUpgrade(runner, "local-storage", func() (worker.Worker, error) {
			// TODO(axw) 2013-09-24 bug #1229507
			// Make another job to enable storage.
			// There's nothing special about this.
			return localstorage.NewWorker(agentConfig), nil
		})
	}
	for _, job := range m.Jobs() {
		switch job {
		case state.JobHostUnits:
			// Implemented in APIWorker.
		case state.JobManageEnviron:
			useMultipleCPUs()
			a.startWorkerAfterUpgrade(runner, "env worker manager", func() (worker.Worker, error) {
				return envworkermanager.NewEnvWorkerManager(st, a.startEnvWorkers), nil
			})
			a.startWorkerAfterUpgrade(runner, "peergrouper", func() (worker.Worker, error) {
				return peergrouperNew(st)
			})
			a.startWorkerAfterUpgrade(runner, "restore", func() (worker.Worker, error) {
				return a.newRestoreStateWatcherWorker(st)
			})
			a.startWorkerAfterUpgrade(runner, "lease manager", func() (worker.Worker, error) {
				return lease.NewLeaseManager(st)
			})
			certChangedChan := make(chan params.StateServingInfo, 1)
			runner.StartWorker("apiserver", a.apiserverWorkerStarter(st, certChangedChan))
			var stateServingSetter certupdater.StateServingInfoSetter = func(info params.StateServingInfo, done <-chan struct{}) error {
				return a.ChangeConfig(func(config agent.ConfigSetter) error {
					config.SetStateServingInfo(info)
					logger.Infof("update apiserver worker with new certificate")
					select {
					case certChangedChan <- info:
						return nil
					case <-done:
						return nil
					}
				})
			}
			a.startWorkerAfterUpgrade(runner, "certupdater", func() (worker.Worker, error) {
				return newCertificateUpdater(m, agentConfig, st, stateServingSetter, certChangedChan), nil
			})

			if featureflag.Enabled(feature.DbLog) {
				a.startWorkerAfterUpgrade(singularRunner, "dblogpruner", func() (worker.Worker, error) {
					return dblogpruner.New(st, dblogpruner.NewLogPruneParams()), nil
				})
			}
			a.startWorkerAfterUpgrade(singularRunner, "statushistorypruner", func() (worker.Worker, error) {
				return statushistorypruner.New(st, statushistorypruner.NewHistoryPrunerParams()), nil
			})

			a.startWorkerAfterUpgrade(singularRunner, "txnpruner", func() (worker.Worker, error) {
				return txnpruner.New(st, time.Hour*2), nil
			})

		case state.JobManageStateDeprecated:
			// Legacy environments may set this, but we ignore it.
		default:
			logger.Warningf("ignoring unknown job %q", job)
		}
	}
	return cmdutil.NewCloseWorker(logger, runner, st), nil
}

// startEnvWorkers starts state server workers that need to run per
// environment.
func (a *MachineAgent) startEnvWorkers(
	ssSt envworkermanager.InitialState,
	st *state.State,
) (_ worker.Worker, err error) {
	envUUID := st.EnvironUUID()
	defer errors.DeferredAnnotatef(&err, "failed to start workers for env %s", envUUID)
	logger.Infof("starting workers for env %s", envUUID)

	// Establish an API connection for this environment.
	agentConfig := a.CurrentConfig()
	apiInfo := agentConfig.APIInfo()
	apiInfo.EnvironTag = st.EnvironTag()
	apiSt, err := OpenAPIStateUsingInfo(apiInfo, a, agentConfig.OldPassword())
	if err != nil {
		return nil, errors.Trace(err)
	}

	// Create a runner for workers specific to this environment. Either the
	// State or API connection failing will be considered fatal, killing the
	// runner and all its workers.
	runner := newConnRunner(st, apiSt)
	defer func() {
		if err != nil && runner != nil {
			runner.Kill()
			runner.Wait()
		}
	}()
	// Close the API connection when the runner for this environment dies.
	go func() {
		runner.Wait()
		err := apiSt.Close()
		if err != nil {
			logger.Errorf("failed to close API connection for env %s: %v", envUUID, err)
		}
	}()

	// Create a singular runner for this environment.
	machine, err := ssSt.Machine(a.machineId)
	if err != nil {
		return nil, errors.Trace(err)
	}
	singularRunner, err := newSingularStateRunner(runner, ssSt, machine)
	if err != nil {
		return nil, errors.Trace(err)
	}
	defer func() {
		if err != nil && singularRunner != nil {
			singularRunner.Kill()
			singularRunner.Wait()
		}
	}()

	// Start workers that depend on a *state.State.
	// TODO(fwereade): 2015-04-21 THIS SHALL NOT PASS
	// Seriously, these should all be using the API.
	singularRunner.StartWorker("minunitsworker", func() (worker.Worker, error) {
		return minunitsworker.NewMinUnitsWorker(st), nil
	})
	singularRunner.StartWorker("addresserworker", func() (worker.Worker, error) {
		return addresser.NewWorker(st)
	})

	// Start workers that use an API connection.
	singularRunner.StartWorker("environ-provisioner", func() (worker.Worker, error) {
		return provisioner.NewEnvironProvisioner(apiSt.Provisioner(), agentConfig), nil
	})
	singularRunner.StartWorker("environ-storageprovisioner", func() (worker.Worker, error) {
		scope := st.EnvironTag()
		api := apiSt.StorageProvisioner(scope)
		return newStorageWorker(scope, "", api, api, api, api, api), nil
	})
	singularRunner.StartWorker("charm-revision-updater", func() (worker.Worker, error) {
		return charmrevisionworker.NewRevisionUpdateWorker(apiSt.CharmRevisionUpdater()), nil
	})
	runner.StartWorker("metricmanagerworker", func() (worker.Worker, error) {
		return metricworker.NewMetricsManager(getMetricAPI(apiSt))
	})
	singularRunner.StartWorker("instancepoller", func() (worker.Worker, error) {
		return newInstancePoller(apiSt.InstancePoller()), nil
	})
	singularRunner.StartWorker("cleaner", func() (worker.Worker, error) {
		return newCleaner(apiSt.Cleaner()), nil
	})

	// TODO(axw) 2013-09-24 bug #1229506
	// Make another job to enable the firewaller. Not all
	// environments are capable of managing ports
	// centrally.
	fwMode, err := getFirewallMode(apiSt)
	if err != nil {
		return nil, errors.Annotate(err, "cannot get firewall mode")
	}
	if fwMode != config.FwNone {
		singularRunner.StartWorker("firewaller", func() (worker.Worker, error) {
			return newFirewaller(apiSt.Firewaller())
		})
	} else {
		logger.Debugf("not starting firewaller worker - firewall-mode is %q", fwMode)
	}

	return runner, nil
}

var getFirewallMode = _getFirewallMode

func _getFirewallMode(apiSt *api.State) (string, error) {
	envConfig, err := apiSt.Environment().EnvironConfig()
	if err != nil {
		return "", errors.Annotate(err, "cannot read environment config")
	}
	return envConfig.FirewallMode(), nil
}
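
// Test seam sketch (hedged): getFirewallMode being a package variable means
// tests can force a particular mode without an API connection:
//
//	getFirewallMode = func(*api.State) (string, error) {
//		return config.FwNone, nil // the firewaller worker will not start
//	}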

// stateWorkerDialOpts is a mongo.DialOpts suitable
// for use by StateWorker to dial mongo.
//
// This must be overridden in tests, as it assumes
// journaling is enabled.
var stateWorkerDialOpts mongo.DialOpts

func (a *MachineAgent) apiserverWorkerStarter(st *state.State, certChanged chan params.StateServingInfo) func() (worker.Worker, error) {
	return func() (worker.Worker, error) { return a.newApiserverWorker(st, certChanged) }
}

func (a *MachineAgent) newApiserverWorker(st *state.State, certChanged chan params.StateServingInfo) (worker.Worker, error) {
	agentConfig := a.CurrentConfig()
	// If the configuration does not have the required information,
	// it is currently not a recoverable error, so we kill the whole
	// agent, potentially enabling human intervention to fix
	// the agent's configuration file.
	info, ok := agentConfig.StateServingInfo()
	if !ok {
		return nil, &cmdutil.FatalError{"StateServingInfo not available and we need it"}
	}
	cert := []byte(info.Cert)
	key := []byte(info.PrivateKey)

	if len(cert) == 0 || len(key) == 0 {
		return nil, &cmdutil.FatalError{"configuration does not have state server cert/key"}
	}
	tag := agentConfig.Tag()
	dataDir := agentConfig.DataDir()
	logDir := agentConfig.LogDir()

	endpoint := net.JoinHostPort("", strconv.Itoa(info.APIPort))
	listener, err := net.Listen("tcp", endpoint)
	if err != nil {
		return nil, err
	}
	return apiserver.NewServer(st, listener, apiserver.ServerConfig{
		Cert:        cert,
		Key:         key,
		Tag:         tag,
		DataDir:     dataDir,
		LogDir:      logDir,
		Validator:   a.limitLogins,
		CertChanged: certChanged,
	})
}

// limitLogins is called by the API server for each login attempt.
// It returns an error if upgrades or restore are running.
func (a *MachineAgent) limitLogins(req params.LoginRequest) error {
	if err := a.limitLoginsDuringRestore(req); err != nil {
		return err
	}
	return a.limitLoginsDuringUpgrade(req)
}

// limitLoginsDuringRestore will only allow logins for restore-related purposes
// while the different steps of restore are running.
func (a *MachineAgent) limitLoginsDuringRestore(req params.LoginRequest) error {
	var err error
	switch {
	case a.IsRestoreRunning():
		err = apiserver.RestoreInProgressError
	case a.IsRestorePreparing():
		err = apiserver.AboutToRestoreError
	}
	if err != nil {
		authTag, parseErr := names.ParseTag(req.AuthTag)
		if parseErr != nil {
			return errors.Annotate(err, "could not parse auth tag")
		}
		switch authTag := authTag.(type) {
		case names.UserTag:
			// Use a restricted API mode.
			return err
		case names.MachineTag:
			if authTag == a.Tag() {
				// Allow logins from the local machine.
				return nil
			}
		}
		return errors.Errorf("login for %q blocked because restore is in progress", authTag)
	}
	return nil
}
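
// Behavior sketch (hedged; the tag string follows the names package
// conventions used above): mid-upgrade or mid-restore the validator lets
// the local machine in and turns everyone else away:
//
//	err := a.limitLogins(params.LoginRequest{AuthTag: "machine-0"})
//	// nil when "machine-0" is this agent's own tag; user logins instead
//	// get a restricted-mode error such as apiserver.UpgradeInProgressError.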

// limitLoginsDuringUpgrade is called by the API server for each login
// attempt. It returns an error if upgrades are in progress unless the
// login is for a user (i.e. a client) or the local machine.
func (a *MachineAgent) limitLoginsDuringUpgrade(req params.LoginRequest) error {
	if a.upgradeWorkerContext.IsUpgradeRunning() || a.isAgentUpgradePending() {
		authTag, err := names.ParseTag(req.AuthTag)
		if err != nil {
			return errors.Annotate(err, "could not parse auth tag")
		}
		switch authTag := authTag.(type) {
		case names.UserTag:
			// Use a restricted API mode.
			return apiserver.UpgradeInProgressError
		case names.MachineTag:
			if authTag == a.Tag() {
				// Allow logins from the local machine.
				return nil
			}
		}
		return errors.Errorf("login for %q blocked because %s", authTag, apiserver.UpgradeInProgressError.Error())
	}
	return nil // Allow all logins.
}

var stateWorkerServingConfigErr = errors.New("state worker started with no state serving info")

// ensureMongoServer ensures that mongo is installed and running,
// and ready for opening a state connection.
func (a *MachineAgent) ensureMongoServer(agentConfig agent.Config) (err error) {
	a.mongoInitMutex.Lock()
	defer a.mongoInitMutex.Unlock()
	if a.mongoInitialized {
		logger.Debugf("mongo is already initialized")
		return nil
	}
	defer func() {
		if err == nil {
			a.mongoInitialized = true
		}
	}()

	// Many of the steps here, such as adding the state server to the
	// admin DB and initiating the replicaset, are once-only actions,
	// required when upgrading from a pre-HA-capable
	// environment. These calls won't do anything if the thing they
	// need to set up has already been done.
	var needReplicasetInit = false
	var machineAddrs []network.Address

	mongoInstalled, err := mongo.IsServiceInstalled(agentConfig.Value(agent.Namespace))
	if err != nil {
		return errors.Annotate(err, "error while checking if mongodb service is installed")
	}

	if mongoInstalled {
		logger.Debugf("mongodb service is installed")

		if _, err := a.ensureMongoAdminUser(agentConfig); err != nil {
			return errors.Trace(err)
		}

		if err := a.ensureMongoSharedSecret(agentConfig); err != nil {
			return errors.Trace(err)
		}
		agentConfig = a.CurrentConfig() // ensureMongoSharedSecret may have updated the config

		mongoInfo, ok := agentConfig.MongoInfo()
		if !ok {
			return errors.New("unable to retrieve mongo info to check replicaset")
		}

		needReplicasetInit, err = isReplicasetInitNeeded(mongoInfo)
		if err != nil {
			return errors.Annotate(err, "error while checking replicaset")
		}

		// If the replicaset is to be initialised the machine addresses
		// need to be retrieved *before* MongoDB is restarted with the
		// --replset option (in EnsureMongoServer). Once MongoDB is
		// started with --replset it won't respond to queries until the
		// replicaset is initiated.
		if needReplicasetInit {
			logger.Infof("replicaset not yet configured")
			machineAddrs, err = getMachineAddresses(agentConfig)
			if err != nil {
				return errors.Trace(err)
			}
		}
	}

	// EnsureMongoServer installs/upgrades the init config as necessary.
	ensureServerParams, err := cmdutil.NewEnsureServerParams(agentConfig)
	if err != nil {
		return err
	}
	if err := cmdutil.EnsureMongoServer(ensureServerParams); err != nil {
		return err
	}

	// Initiate the replicaset if required.
	if needReplicasetInit {
		servingInfo, ok := agentConfig.StateServingInfo()
		if !ok {
			return stateWorkerServingConfigErr
		}
		mongoInfo, ok := agentConfig.MongoInfo()
		if !ok {
			return errors.New("unable to retrieve mongo info to initiate replicaset")
		}
		if err := initiateReplicaSet(mongoInfo, servingInfo.StatePort, machineAddrs); err != nil {
			return err
		}
	}

	return nil
}

// ensureMongoAdminUser ensures that the machine's mongo user is in
// the admin DB.
func (a *MachineAgent) ensureMongoAdminUser(agentConfig agent.Config) (added bool, err error) {
	mongoInfo, ok1 := agentConfig.MongoInfo()
	servingInfo, ok2 := agentConfig.StateServingInfo()
	if !ok1 || !ok2 {
		return false, stateWorkerServingConfigErr
	}
	dialInfo, err := mongo.DialInfo(mongoInfo.Info, mongo.DefaultDialOpts())
	if err != nil {
		return false, err
	}
	if len(dialInfo.Addrs) > 1 {
		logger.Infof("more than one state server; admin user must exist")
		return false, nil
	}
	return ensureMongoAdminUser(mongo.EnsureAdminUserParams{
		DialInfo:  dialInfo,
		Namespace: agentConfig.Value(agent.Namespace),
		DataDir:   agentConfig.DataDir(),
		Port:      servingInfo.StatePort,
		User:      mongoInfo.Tag.String(),
		Password:  mongoInfo.Password,
	})
}

// ensureMongoSharedSecret generates a MongoDB shared secret if
// required, updating the agent's config and state.
func (a *MachineAgent) ensureMongoSharedSecret(agentConfig agent.Config) error {
	servingInfo, ok := agentConfig.StateServingInfo()
	if !ok {
		return stateWorkerServingConfigErr
	}

	if servingInfo.SharedSecret != "" {
		return nil // Already done.
	}

	logger.Infof("state serving info has no shared secret - generating")

	var err error
	servingInfo.SharedSecret, err = mongo.GenerateSharedSecret()
	if err != nil {
		return err
	}
	logger.Debugf("updating state serving info in agent config")
	if err = a.ChangeConfig(func(config agent.ConfigSetter) error {
		config.SetStateServingInfo(servingInfo)
		return nil
	}); err != nil {
		return err
	}
	agentConfig = a.CurrentConfig()

	logger.Debugf("updating state serving info in state")

	// Note: we set Direct=true in the mongo options because it's
	// possible that we've previously upgraded the mongo server's
	// configuration to form a replicaset, but failed to initiate it.
	st, _, err := openState(agentConfig, mongo.DialOpts{Direct: true})
	if err != nil {
		return err
	}
	defer st.Close()

	ssi := cmdutil.ParamsStateServingInfoToStateStateServingInfo(servingInfo)
	if err := st.SetStateServingInfo(ssi); err != nil {
		return errors.Errorf("cannot set state serving info: %v", err)
	}

	logger.Infof("shared secret updated in state serving info")
	return nil
}
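
// Illustrative restatement (hedged) of the decision isReplicasetInitNeeded
// makes below: if the current replicaset config can't be read, or has no
// members, initiation is still required:
//
//	cfg, err := replicaset.CurrentConfig(session)
//	needInit := err != nil || len(cfg.Members) < 1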

// isReplicasetInitNeeded returns true if the replicaset needs to be
// initiated.
func isReplicasetInitNeeded(mongoInfo *mongo.MongoInfo) (bool, error) {
	dialInfo, err := mongo.DialInfo(mongoInfo.Info, mongo.DefaultDialOpts())
	if err != nil {
		return false, errors.Annotate(err, "cannot generate dial info to check replicaset")
	}
	dialInfo.Username = mongoInfo.Tag.String()
	dialInfo.Password = mongoInfo.Password

	session, err := mgo.DialWithInfo(dialInfo)
	if err != nil {
		return false, errors.Annotate(err, "cannot dial mongo to check replicaset")
	}
	defer session.Close()

	cfg, err := replicaset.CurrentConfig(session)
	if err != nil {
		logger.Debugf("couldn't retrieve replicaset config (not fatal): %v", err)
		return true, nil
	}
	numMembers := len(cfg.Members)
	logger.Debugf("replicaset member count: %d", numMembers)
	return numMembers < 1, nil
}

// getMachineAddresses connects to state to determine the machine's
// network addresses.
func getMachineAddresses(agentConfig agent.Config) ([]network.Address, error) {
	logger.Debugf("opening state to get machine addresses")
	st, m, err := openState(agentConfig, mongo.DialOpts{Direct: true})
	if err != nil {
		return nil, errors.Annotate(err, "failed to open state to retrieve machine addresses")
	}
	defer st.Close()
	return m.Addresses(), nil
}

// initiateReplicaSet connects to MongoDB and sets up the replicaset.
func initiateReplicaSet(mongoInfo *mongo.MongoInfo, statePort int, machineAddrs []network.Address) error {
	peerAddr := mongo.SelectPeerAddress(machineAddrs)
	if peerAddr == "" {
		return errors.Errorf("no appropriate peer address found in %q", machineAddrs)
	}

	dialInfo, err := mongo.DialInfo(mongoInfo.Info, mongo.DefaultDialOpts())
	if err != nil {
		return errors.Annotate(err, "cannot generate dial info to initiate replicaset")
	}

	if err := maybeInitiateMongoServer(peergrouper.InitiateMongoParams{
		DialInfo:       dialInfo,
		MemberHostPort: net.JoinHostPort(peerAddr, fmt.Sprint(statePort)),
		User:           mongoInfo.Tag.String(), // TODO(dfc) InitiateMongoParams should take a Tag
		Password:       mongoInfo.Password,
	}); err != nil && err != peergrouper.ErrReplicaSetAlreadyInitiated {
		return err
	}
	return nil
}

func openState(agentConfig agent.Config, dialOpts mongo.DialOpts) (_ *state.State, _ *state.Machine, err error) {
	info, ok := agentConfig.MongoInfo()
	if !ok {
		return nil, nil, fmt.Errorf("no state info available")
	}
	st, err := state.Open(info, dialOpts, environs.NewStatePolicy())
	if err != nil {
		return nil, nil, err
	}
	defer func() {
		if err != nil {
			st.Close()
		}
	}()
	m0, err := st.FindEntity(agentConfig.Tag())
	if err != nil {
		if errors.IsNotFound(err) {
			err = worker.ErrTerminateAgent
		}
		return nil, nil, err
	}
	m := m0.(*state.Machine)
	if m.Life() == state.Dead {
		return nil, nil, worker.ErrTerminateAgent
	}
	// Check the machine nonce as provisioned matches the agent.Conf value.
	if !m.CheckProvisioned(agentConfig.Nonce()) {
		// The agent is running on a different machine to the one it
		// should be according to state. It must stop immediately.
		logger.Errorf("running machine %v agent on inappropriate instance", m)
		return nil, nil, worker.ErrTerminateAgent
	}
	return st, m, nil
}

// startWorkerAfterUpgrade starts a worker to run the specified child worker
// but only after waiting for upgrades to complete.
func (a *MachineAgent) startWorkerAfterUpgrade(runner worker.Runner, name string, start func() (worker.Worker, error)) {
	runner.StartWorker(name, func() (worker.Worker, error) {
		return a.upgradeWaiterWorker(name, start), nil
	})
}

// upgradeWaiterWorker runs the specified worker after upgrades have completed.
func (a *MachineAgent) upgradeWaiterWorker(name string, start func() (worker.Worker, error)) worker.Worker {
	return worker.NewSimpleWorker(func(stop <-chan struct{}) error {
		// Wait for the agent upgrade and upgrade steps to complete (or for us to be stopped).
		for _, ch := range []chan struct{}{
			a.upgradeWorkerContext.UpgradeComplete,
			a.initialAgentUpgradeCheckComplete,
		} {
			select {
			case <-stop:
				return nil
			case <-ch:
			}
		}
		logger.Debugf("upgrades done, starting worker %q", name)
		// Upgrades are done, start the worker.
		worker, err := start()
		if err != nil {
			return err
		}
		// Wait for the worker to finish or for us to be stopped.
		waitCh := make(chan error)
		go func() {
			waitCh <- worker.Wait()
		}()
		select {
		case err := <-waitCh:
			logger.Debugf("worker %q exited with %v", name, err)
			return err
		case <-stop:
			logger.Debugf("stopping, so killing worker %q", name)
			worker.Kill()
		}
		return <-waitCh // Ensure the worker has stopped before returning.
	})
}

func (a *MachineAgent) setMachineStatus(apiState *api.State, status params.Status, info string) error {
	tag := a.Tag().(names.MachineTag)
	machine, err := apiState.Machiner().Machine(tag)
	if err != nil {
		return errors.Trace(err)
	}
	if err := machine.SetStatus(status, info, nil); err != nil {
		return errors.Trace(err)
	}
	return nil
}

// WorkersStarted returns a channel that's closed once all top level workers
// have been started. This is provided for testing purposes.
func (a *MachineAgent) WorkersStarted() <-chan struct{} {
	return a.workersStarted
}

// Tag returns the machine tag for this agent.
func (a *MachineAgent) Tag() names.Tag {
	return names.NewMachineTag(a.machineId)
}

func (a *MachineAgent) createJujuRun(dataDir string) error {
	// TODO: do not remove the symlink if it already points
	// to the right place.
	if err := os.Remove(JujuRun); err != nil && !os.IsNotExist(err) {
		return err
	}
	jujud := filepath.Join(dataDir, "tools", a.Tag().String(), jujunames.Jujud)
	return symlink.New(jujud, JujuRun)
}

func (a *MachineAgent) uninstallAgent(agentConfig agent.Config) error {
	var errors []error
	agentServiceName := agentConfig.Value(agent.AgentServiceName)
	if agentServiceName == "" {
		// For backwards compatibility, handle lack of AgentServiceName.
		agentServiceName = os.Getenv("UPSTART_JOB")
	}
	if agentServiceName != "" {
		svc, err := service.DiscoverService(agentServiceName, common.Conf{})
		if err != nil {
			errors = append(errors, fmt.Errorf("cannot remove service %q: %v", agentServiceName, err))
		} else if err := svc.Remove(); err != nil {
			errors = append(errors, fmt.Errorf("cannot remove service %q: %v", agentServiceName, err))
		}
	}
	// Remove the juju-run symlink.
	if err := os.Remove(JujuRun); err != nil && !os.IsNotExist(err) {
		errors = append(errors, err)
	}

	namespace := agentConfig.Value(agent.Namespace)
	if err := mongo.RemoveService(namespace); err != nil {
		errors = append(errors, fmt.Errorf("cannot stop/remove mongo service with namespace %q: %v", namespace, err))
	}
	if err := os.RemoveAll(agentConfig.DataDir()); err != nil {
		errors = append(errors, err)
	}
	if len(errors) == 0 {
		return nil
	}
	return fmt.Errorf("uninstall failed: %v", errors)
}

func newConnRunner(conns ...cmdutil.Pinger) worker.Runner {
	return worker.NewRunner(cmdutil.ConnectionIsFatal(logger, conns...), cmdutil.MoreImportant)
}

// MongoSessioner exposes a MongoDB session.
type MongoSessioner interface {
	MongoSession() *mgo.Session
}

func newSingularStateRunner(runner worker.Runner, st MongoSessioner, m *state.Machine) (worker.Runner, error) {
	singularStateConn := singularStateConn{st.MongoSession(), m}
	singularRunner, err := newSingularRunner(runner, singularStateConn)
	if err != nil {
		return nil, errors.Annotate(err, "cannot make singular State Runner")
	}
	return singularRunner, err
}

// singularStateConn implements singular.Conn on
// top of a State connection.
type singularStateConn struct {
	session *mgo.Session
	machine *state.Machine
}

func (c singularStateConn) IsMaster() (bool, error) {
	return mongo.IsMaster(c.session, c.machine)
}

func (c singularStateConn) Ping() error {
	return c.session.Ping()
}

func metricAPI(st *api.State) metricsmanager.MetricsManagerClient {
	return metricsmanager.NewClient(st)
}

// newDeployContext gives the tests the opportunity to create a deployer.Context
// that can be used for testing so as to avoid (1) deploying units to the system
// running the tests and (2) getting access to the *State used internally, so
// that tests can be run without waiting for the 5s watcher refresh time to
// which we would otherwise be restricted.
var newDeployContext = func(st *apideployer.State, agentConfig agent.Config) deployer.Context {
	return deployer.NewSimpleContext(agentConfig, st)
}
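
// Test seam sketch (hedged; myFakeContext is hypothetical): like the other
// package-level hooks near the top of this file, newDeployContext can be
// swapped out for the duration of a test:
//
//	restore := newDeployContext
//	newDeployContext = func(*apideployer.State, agent.Config) deployer.Context {
//		return myFakeContext{}
//	}
//	defer func() { newDeployContext = restore }()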