github.com/mhilton/juju-juju@v0.0.0-20150901100907-a94dd2c73455/cmd/jujud/agent/machine.go

// Copyright 2012, 2013 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package agent

import (
    "fmt"
    "io"
    "net"
    "os"
    "path/filepath"
    "runtime"
    "strconv"
    "sync"
    "time"

    "github.com/juju/cmd"
    "github.com/juju/errors"
    "github.com/juju/loggo"
    "github.com/juju/names"
    "github.com/juju/replicaset"
    "github.com/juju/utils"
    "github.com/juju/utils/clock"
    "github.com/juju/utils/featureflag"
    "github.com/juju/utils/set"
    "github.com/juju/utils/symlink"
    "github.com/juju/utils/voyeur"
    "gopkg.in/juju/charm.v5/charmrepo"
    "gopkg.in/mgo.v2"
    "gopkg.in/natefinch/lumberjack.v2"
    "launchpad.net/gnuflag"
    "launchpad.net/tomb"

    "github.com/juju/juju/agent"
    "github.com/juju/juju/api"
    apiagent "github.com/juju/juju/api/agent"
    apideployer "github.com/juju/juju/api/deployer"
    "github.com/juju/juju/api/metricsmanager"
    apiupgrader "github.com/juju/juju/api/upgrader"
    "github.com/juju/juju/apiserver"
    "github.com/juju/juju/apiserver/params"
    "github.com/juju/juju/cert"
    "github.com/juju/juju/cmd/jujud/reboot"
    cmdutil "github.com/juju/juju/cmd/jujud/util"
    "github.com/juju/juju/cmd/jujud/util/password"
    "github.com/juju/juju/container"
    "github.com/juju/juju/container/kvm"
    "github.com/juju/juju/container/lxc"
    "github.com/juju/juju/container/lxc/lxcutils"
    "github.com/juju/juju/environs"
    "github.com/juju/juju/environs/config"
    "github.com/juju/juju/feature"
    "github.com/juju/juju/instance"
    jujunames "github.com/juju/juju/juju/names"
    "github.com/juju/juju/juju/paths"
    "github.com/juju/juju/mongo"
    "github.com/juju/juju/network"
    "github.com/juju/juju/provider"
    "github.com/juju/juju/service"
    "github.com/juju/juju/service/common"
    "github.com/juju/juju/state"
    "github.com/juju/juju/state/multiwatcher"
    statestorage "github.com/juju/juju/state/storage"
    "github.com/juju/juju/storage/looputil"
    "github.com/juju/juju/version"
    "github.com/juju/juju/worker"
    "github.com/juju/juju/worker/addresser"
    "github.com/juju/juju/worker/apiaddressupdater"
    "github.com/juju/juju/worker/apicaller"
    "github.com/juju/juju/worker/authenticationworker"
    "github.com/juju/juju/worker/certupdater"
    "github.com/juju/juju/worker/charmrevisionworker"
    "github.com/juju/juju/worker/cleaner"
    "github.com/juju/juju/worker/conv2state"
    "github.com/juju/juju/worker/dblogpruner"
    "github.com/juju/juju/worker/deployer"
    "github.com/juju/juju/worker/diskmanager"
    "github.com/juju/juju/worker/envworkermanager"
    "github.com/juju/juju/worker/firewaller"
    "github.com/juju/juju/worker/gate"
    "github.com/juju/juju/worker/instancepoller"
    "github.com/juju/juju/worker/localstorage"
    workerlogger "github.com/juju/juju/worker/logger"
    "github.com/juju/juju/worker/logsender"
    "github.com/juju/juju/worker/machiner"
    "github.com/juju/juju/worker/metricworker"
    "github.com/juju/juju/worker/minunitsworker"
    "github.com/juju/juju/worker/networker"
    "github.com/juju/juju/worker/peergrouper"
    "github.com/juju/juju/worker/provisioner"
    "github.com/juju/juju/worker/proxyupdater"
    rebootworker "github.com/juju/juju/worker/reboot"
    "github.com/juju/juju/worker/resumer"
    "github.com/juju/juju/worker/rsyslog"
    "github.com/juju/juju/worker/singular"
    "github.com/juju/juju/worker/statushistorypruner"
    "github.com/juju/juju/worker/storageprovisioner"
    "github.com/juju/juju/worker/terminationworker"
    "github.com/juju/juju/worker/txnpruner"
    "github.com/juju/juju/worker/upgrader"
)

const bootstrapMachineId = "0"

var (
    logger     = loggo.GetLogger("juju.cmd.jujud")
    retryDelay = 3 * time.Second
    JujuRun    = paths.MustSucceed(paths.JujuRun(version.Current.Series))

    // The following are defined as variables to allow the tests to
    // intercept calls to the functions.
    useMultipleCPUs          = utils.UseMultipleCPUs
    maybeInitiateMongoServer = peergrouper.MaybeInitiateMongoServer
    ensureMongoAdminUser     = mongo.EnsureAdminUser
    newSingularRunner        = singular.New
    peergrouperNew           = peergrouper.New
    newMachiner              = machiner.NewMachiner
    newNetworker             = networker.NewNetworker
    newFirewaller            = firewaller.NewFirewaller
    newDiskManager           = diskmanager.NewWorker
    newStorageWorker         = storageprovisioner.NewStorageProvisioner
    newCertificateUpdater    = certupdater.NewCertificateUpdater
    newResumer               = resumer.NewResumer
    newInstancePoller        = instancepoller.NewWorker
    newCleaner               = cleaner.NewCleaner
    newAddresser             = addresser.NewWorker
    reportOpenedState        = func(io.Closer) {}
    reportOpenedAPI          = func(io.Closer) {}
    getMetricAPI             = metricAPI
)

// Variable to override in tests, default is true.
var ProductionMongoWriteConcern = true

func init() {
    stateWorkerDialOpts = mongo.DefaultDialOpts()
    stateWorkerDialOpts.PostDial = func(session *mgo.Session) error {
        safe := mgo.Safe{}
        if ProductionMongoWriteConcern {
            safe.J = true
            _, err := replicaset.CurrentConfig(session)
            if err == nil {
                // set mongo to write-majority (writes only returned after
                // replicated to a majority of replica-set members).
                safe.WMode = "majority"
            }
        }
        session.SetSafe(&safe)
        return nil
    }
}

// AgentInitializer handles initializing a type for use as a Jujud
// agent.
type AgentInitializer interface {
    AddFlags(*gnuflag.FlagSet)
    CheckArgs([]string) error
}

// AgentConfigWriter encapsulates disk I/O operations with the agent
// config.
type AgentConfigWriter interface {
    // ReadConfig reads the config for the given tag from disk.
    ReadConfig(tag string) error
    // ChangeConfig executes the given agent.ConfigMutator in a
    // thread-safe context.
    ChangeConfig(agent.ConfigMutator) error
    // CurrentConfig returns a copy of the in-memory agent config.
    CurrentConfig() agent.Config
}

// NewMachineAgentCmd creates a Command which handles parsing
// command-line arguments and instantiating and running a
// MachineAgent.
func NewMachineAgentCmd(
    ctx *cmd.Context,
    machineAgentFactory func(string) *MachineAgent,
    agentInitializer AgentInitializer,
    configFetcher AgentConfigWriter,
) cmd.Command {
    return &machineAgentCmd{
        ctx:                 ctx,
        machineAgentFactory: machineAgentFactory,
        agentInitializer:    agentInitializer,
        currentConfig:       configFetcher,
    }
}

type machineAgentCmd struct {
    cmd.CommandBase

    // This group of arguments is required.
    agentInitializer    AgentInitializer
    currentConfig       AgentConfigWriter
    machineAgentFactory func(string) *MachineAgent
    ctx                 *cmd.Context

    // This group is for debugging purposes.
    logToStdErr bool

    // The following are set via command-line flags.
    machineId string
}

// Init is called by the cmd system to initialize the structure for
// running.
func (a *machineAgentCmd) Init(args []string) error {

    if !names.IsValidMachine(a.machineId) {
        return fmt.Errorf("--machine-id option must be set, and expects a non-negative integer")
    }
    if err := a.agentInitializer.CheckArgs(args); err != nil {
        return err
    }

    // Due to changes in the logging, and needing to care about old
    // environments that have been upgraded, we need to explicitly remove the
    // file writer if one has been added, otherwise we will get duplicate
    // lines of all logging in the log file.
    loggo.RemoveWriter("logfile")

    if a.logToStdErr {
        return nil
    }

    err := a.currentConfig.ReadConfig(names.NewMachineTag(a.machineId).String())
    if err != nil {
        return errors.Annotate(err, "cannot read agent configuration")
    }
    agentConfig := a.currentConfig.CurrentConfig()

    // the context's stderr is set as the loggo writer in github.com/juju/cmd/logging.go
    a.ctx.Stderr = &lumberjack.Logger{
        Filename:   agent.LogFilename(agentConfig),
        MaxSize:    300, // megabytes
        MaxBackups: 2,
    }

    return nil
}

// Run instantiates a MachineAgent and runs it.
func (a *machineAgentCmd) Run(c *cmd.Context) error {
    machineAgent := a.machineAgentFactory(a.machineId)
    return machineAgent.Run(c)
}

// SetFlags adds the requisite flags to run this command.
func (a *machineAgentCmd) SetFlags(f *gnuflag.FlagSet) {
    a.agentInitializer.AddFlags(f)
    f.StringVar(&a.machineId, "machine-id", "", "id of the machine to run")
}

// Info returns usage information for the command.
func (a *machineAgentCmd) Info() *cmd.Info {
    return &cmd.Info{
        Name:    "machine",
        Purpose: "run a juju machine agent",
    }
}

// MachineAgentFactoryFn returns a function which instantiates a
// MachineAgent given a machineId.
func MachineAgentFactoryFn(
    agentConfWriter AgentConfigWriter,
    bufferedLogs logsender.LogRecordCh,
    loopDeviceManager looputil.LoopDeviceManager,
) func(string) *MachineAgent {
    return func(machineId string) *MachineAgent {
        return NewMachineAgent(
            machineId,
            agentConfWriter,
            bufferedLogs,
            NewUpgradeWorkerContext(),
            worker.NewRunner(cmdutil.IsFatal, cmdutil.MoreImportant),
            loopDeviceManager,
        )
    }
}

// NewMachineAgent instantiates a new MachineAgent.
func NewMachineAgent(
    machineId string,
    agentConfWriter AgentConfigWriter,
    bufferedLogs logsender.LogRecordCh,
    upgradeWorkerContext *upgradeWorkerContext,
    runner worker.Runner,
    loopDeviceManager looputil.LoopDeviceManager,
) *MachineAgent {
    return &MachineAgent{
        machineId:                        machineId,
        AgentConfigWriter:                agentConfWriter,
        bufferedLogs:                     bufferedLogs,
        upgradeWorkerContext:             upgradeWorkerContext,
        workersStarted:                   make(chan struct{}),
        runner:                           runner,
        initialAgentUpgradeCheckComplete: make(chan struct{}),
        loopDeviceManager:                loopDeviceManager,
    }
}

// MachineAgent is responsible for tying together all functionality
// needed to orchestrate a Jujud instance which controls a machine.
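// It embeds AgentConfigWriter so the agent's on-disk configuration can be
// read and mutated through the agent itself.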
type MachineAgent struct {
    AgentConfigWriter

    tomb                 tomb.Tomb
    machineId            string
    previousAgentVersion version.Number
    runner               worker.Runner
    bufferedLogs         logsender.LogRecordCh
    configChangedVal     voyeur.Value
    upgradeWorkerContext *upgradeWorkerContext
    workersStarted       chan struct{}

    // XXX(fwereade): these smell strongly of goroutine-unsafeness.
    restoreMode bool
    restoring   bool

    // Used to signal that the upgrade worker will not
    // reboot the agent on startup because there are no
    // longer any immediately pending agent upgrades.
    // Channel used as a selectable bool (closed means true).
    initialAgentUpgradeCheckComplete chan struct{}

    mongoInitMutex   sync.Mutex
    mongoInitialized bool

    loopDeviceManager looputil.LoopDeviceManager
}

// IsRestorePreparing returns a bool representing whether we are in restore
// mode but not running restore.
func (a *MachineAgent) IsRestorePreparing() bool {
    return a.restoreMode && !a.restoring
}

// IsRestoreRunning returns a bool representing whether we are in restore
// mode and running the actual restore process.
func (a *MachineAgent) IsRestoreRunning() bool {
    return a.restoring
}

func (a *MachineAgent) isAgentUpgradePending() bool {
    select {
    case <-a.initialAgentUpgradeCheckComplete:
        return false
    default:
        return true
    }
}

// Wait waits for the machine agent to finish.
func (a *MachineAgent) Wait() error {
    return a.tomb.Wait()
}

// Stop stops the machine agent.
func (a *MachineAgent) Stop() error {
    a.runner.Kill()
    return a.tomb.Wait()
}

// Dying returns the channel that can be used to see if the machine
// agent is terminating.
func (a *MachineAgent) Dying() <-chan struct{} {
    return a.tomb.Dying()
}

// upgradeCertificateDNSNames ensures that the state server certificate
// recorded in the agent config and in mongo's server.pem contains the
// DNS names required by Juju.
func (a *MachineAgent) upgradeCertificateDNSNames() error {
    agentConfig := a.CurrentConfig()
    si, ok := agentConfig.StateServingInfo()
    if !ok || si.CAPrivateKey == "" {
        // No certificate information exists yet, nothing to do.
        return nil
    }
    // Parse the current certificate to get the current dns names.
    serverCert, err := cert.ParseCert(si.Cert)
    if err != nil {
        return err
    }
    update := false
    dnsNames := set.NewStrings(serverCert.DNSNames...)
    requiredDNSNames := []string{"local", "juju-apiserver", "juju-mongodb"}
    for _, dnsName := range requiredDNSNames {
        if dnsNames.Contains(dnsName) {
            continue
        }
        dnsNames.Add(dnsName)
        update = true
    }
    if !update {
        return nil
    }
    // Write a new certificate to the mongo pem and agent config files.
    si.Cert, si.PrivateKey, err = cert.NewDefaultServer(agentConfig.CACert(), si.CAPrivateKey, dnsNames.Values())
    if err != nil {
        return err
    }
    if err := mongo.UpdateSSLKey(agentConfig.DataDir(), si.Cert, si.PrivateKey); err != nil {
        return err
    }
    return a.AgentConfigWriter.ChangeConfig(func(config agent.ConfigSetter) error {
        config.SetStateServingInfo(si)
        return nil
    })
}

// Run runs a machine agent.
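// It reads the agent configuration, upgrades the server certificate DNS names
// if required, starts the top-level workers ("api", "statestarter" and
// "termination"), and then blocks until the runner stops, translating
// termination, reboot and shutdown errors into the corresponding actions.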
func (a *MachineAgent) Run(*cmd.Context) error {

    defer a.tomb.Done()
    if err := a.ReadConfig(a.Tag().String()); err != nil {
        return fmt.Errorf("cannot read agent configuration: %v", err)
    }

    logger.Infof("machine agent %v start (%s [%s])", a.Tag(), version.Current, runtime.Compiler)
    if flags := featureflag.String(); flags != "" {
        logger.Warningf("developer feature flags enabled: %s", flags)
    }

    // Before doing anything else, we need to make sure the certificate generated for
    // use by mongo to validate state server connections is correct. This needs to be done
    // before any possible restart of the mongo service.
    // See bug http://pad.lv/1434680
    if err := a.upgradeCertificateDNSNames(); err != nil {
        return errors.Annotate(err, "error upgrading server certificate")
    }

    agentConfig := a.CurrentConfig()

    if err := a.upgradeWorkerContext.InitializeUsingAgent(a); err != nil {
        return errors.Annotate(err, "error during upgradeWorkerContext initialisation")
    }
    a.configChangedVal.Set(struct{}{})
    a.previousAgentVersion = agentConfig.UpgradedToVersion()

    network.InitializeFromConfig(agentConfig)
    charmrepo.CacheDir = filepath.Join(agentConfig.DataDir(), "charmcache")
    if err := a.createJujuRun(agentConfig.DataDir()); err != nil {
        return fmt.Errorf("cannot create juju run symlink: %v", err)
    }
    a.runner.StartWorker("api", a.APIWorker)
    a.runner.StartWorker("statestarter", a.newStateStarterWorker)
    a.runner.StartWorker("termination", func() (worker.Worker, error) {
        return terminationworker.NewWorker(), nil
    })

    // At this point, all workers will have been configured to start.
    close(a.workersStarted)
    err := a.runner.Wait()
    switch err {
    case worker.ErrTerminateAgent:
        err = a.uninstallAgent(agentConfig)
    case worker.ErrRebootMachine:
        logger.Infof("Caught reboot error")
        err = a.executeRebootOrShutdown(params.ShouldReboot)
    case worker.ErrShutdownMachine:
        logger.Infof("Caught shutdown error")
        err = a.executeRebootOrShutdown(params.ShouldShutdown)
    }
    err = cmdutil.AgentDone(logger, err)
    a.tomb.Kill(err)
    return err
}

func (a *MachineAgent) executeRebootOrShutdown(action params.RebootAction) error {
    agentCfg := a.CurrentConfig()
    // At this stage, all API connections would have been closed.
    // We need to reopen the API to clear the reboot flag after
    // scheduling the reboot. It may be cleaner to do this in the reboot
    // worker, before returning the ErrRebootMachine.
    st, _, err := apicaller.OpenAPIState(a)
    if err != nil {
        logger.Infof("Reboot: Error connecting to state")
        return errors.Trace(err)
    }
    // block until all units/containers are ready, and reboot/shutdown
    finalize, err := reboot.NewRebootWaiter(st, agentCfg)
    if err != nil {
        return errors.Trace(err)
    }

    logger.Infof("Reboot: Executing reboot")
    err = finalize.ExecuteReboot(action)
    if err != nil {
        logger.Infof("Reboot: Error executing reboot: %v", err)
        return errors.Trace(err)
    }
    // On windows, the shutdown command is asynchronous. We return ErrRebootMachine
    // so the agent will simply exit without error pending reboot/shutdown.
    return worker.ErrRebootMachine
}

func (a *MachineAgent) ChangeConfig(mutate agent.ConfigMutator) error {
    err := a.AgentConfigWriter.ChangeConfig(mutate)
    a.configChangedVal.Set(struct{}{})
    if err != nil {
        return errors.Trace(err)
    }
    return nil
}

// PrepareRestore will flag the agent to allow only a limited set
// of commands defined in
// "github.com/juju/juju/apiserver".allowedMethodsAboutToRestore;
// the most noteworthy is Backups.Restore, which ensures that we can do
// all the file movements required for restore and that no one will make
// changes while we do that. It will return an error if the machine is
// already in this state.
func (a *MachineAgent) PrepareRestore() error {
    if a.restoreMode {
        return errors.Errorf("already in restore mode")
    }
    a.restoreMode = true
    return nil
}

// BeginRestore will flag the agent to disallow all commands, since
// restore should be running and therefore making changes that
// would override anything done.
func (a *MachineAgent) BeginRestore() error {
    switch {
    case !a.restoreMode:
        return errors.Errorf("not in restore mode, cannot begin restoration")
    case a.restoring:
        return errors.Errorf("already restoring")
    }
    a.restoring = true
    return nil
}

// EndRestore will flag the agent to allow all commands.
// This being invoked means that the restore process failed,
// since success restarts the agent.
func (a *MachineAgent) EndRestore() {
    a.restoreMode = false
    a.restoring = false
}

// newRestoreStateWatcherWorker will return a worker, or an error if there
// is a failure; the worker takes care of watching the state of the
// restoreInfo doc and puts the agent in the different restore modes.
func (a *MachineAgent) newRestoreStateWatcherWorker(st *state.State) (worker.Worker, error) {
    rWorker := func(stopch <-chan struct{}) error {
        return a.restoreStateWatcher(st, stopch)
    }
    return worker.NewSimpleWorker(rWorker), nil
}

// restoreChanged will be called whenever the restoreInfo doc changes,
// signaling a new step in the restore process.
func (a *MachineAgent) restoreChanged(st *state.State) error {
    rinfo, err := st.RestoreInfoSetter()
    if err != nil {
        return errors.Annotate(err, "cannot read restore state")
    }
    switch rinfo.Status() {
    case state.RestorePending:
        a.PrepareRestore()
    case state.RestoreInProgress:
        a.BeginRestore()
    case state.RestoreFailed:
        a.EndRestore()
    }
    return nil
}

// restoreStateWatcher watches restoreInfo looking for changes in the restore process.
func (a *MachineAgent) restoreStateWatcher(st *state.State, stopch <-chan struct{}) error {
    restoreWatch := st.WatchRestoreInfoChanges()
    defer func() {
        restoreWatch.Kill()
        restoreWatch.Wait()
    }()

    for {
        select {
        case <-restoreWatch.Changes():
            if err := a.restoreChanged(st); err != nil {
                return err
            }
        case <-stopch:
            return nil
        }
    }
}

// newStateStarterWorker wraps stateStarter in a simple worker for use in
// a.runner.StartWorker.
func (a *MachineAgent) newStateStarterWorker() (worker.Worker, error) {
    return worker.NewSimpleWorker(a.stateStarter), nil
}

// stateStarter watches for changes to the agent configuration, and
// starts or stops the state worker as appropriate.
// We watch the agent
// configuration because the agent configuration has all the details
// that we need to start a state server, whether they have been cached
// or read from the state.
//
// It will stop working as soon as stopch is closed.
func (a *MachineAgent) stateStarter(stopch <-chan struct{}) error {
    confWatch := a.configChangedVal.Watch()
    defer confWatch.Close()
    watchCh := make(chan struct{})
    go func() {
        for confWatch.Next() {
            watchCh <- struct{}{}
        }
    }()
    for {
        select {
        case <-watchCh:
            agentConfig := a.CurrentConfig()

            // N.B. StartWorker and StopWorker are idempotent.
            _, ok := agentConfig.StateServingInfo()
            if ok {
                a.runner.StartWorker("state", func() (worker.Worker, error) {
                    return a.StateWorker()
                })
            } else {
                a.runner.StopWorker("state")
            }
        case <-stopch:
            return nil
        }
    }
}

// APIWorker returns a Worker that connects to the API and starts any
// workers that need an API connection.
func (a *MachineAgent) APIWorker() (_ worker.Worker, err error) {
    st, entity, err := apicaller.OpenAPIState(a)
    if err != nil {
        return nil, err
    }
    reportOpenedAPI(st)

    defer func() {
        // TODO(fwereade): this is not properly tested. Old tests were evil
        // (dependent on injecting an error in a patched-out upgrader API
        // that shouldn't even be used at this level)... so I just deleted
        // them. Not a major worry: this whole method will become redundant
        // when we switch to the dependency engine (and specifically use
        // worker/apicaller to connect).
        if err != nil {
            if err := st.Close(); err != nil {
                logger.Errorf("while closing API: %v", err)
            }
        }
    }()

    agentConfig := a.CurrentConfig()
    for _, job := range entity.Jobs() {
        if job.NeedsState() {
            info, err := st.Agent().StateServingInfo()
            if err != nil {
                return nil, fmt.Errorf("cannot get state serving info: %v", err)
            }
            err = a.ChangeConfig(func(config agent.ConfigSetter) error {
                config.SetStateServingInfo(info)
                return nil
            })
            if err != nil {
                return nil, err
            }
            agentConfig = a.CurrentConfig()
            break
        }
    }

    runner := newConnRunner(st)

    // Run the agent upgrader and the upgrade-steps worker without waiting for
    // the upgrade steps to complete.
    runner.StartWorker("upgrader", a.agentUpgraderWorkerStarter(st.Upgrader(), agentConfig))
    runner.StartWorker("upgrade-steps", a.upgradeStepsWorkerStarter(st, entity.Jobs()))

    // All other workers must wait for the upgrade steps to complete before starting.
    a.startWorkerAfterUpgrade(runner, "api-post-upgrade", func() (worker.Worker, error) {
        return a.postUpgradeAPIWorker(st, agentConfig, entity)
    })
    return cmdutil.NewCloseWorker(logger, runner, st), nil // Note: a worker.Runner is itself a worker.Worker.
}

func (a *MachineAgent) postUpgradeAPIWorker(
    st api.Connection,
    agentConfig agent.Config,
    entity *apiagent.Entity,
) (worker.Worker, error) {

    var isEnvironManager bool
    for _, job := range entity.Jobs() {
        if job == multiwatcher.JobManageEnviron {
            isEnvironManager = true
            break
        }
    }

    runner := newConnRunner(st)

    // TODO(fwereade): this is *still* a hideous layering violation, but at least
    // it's confined to jujud rather than extending into the worker itself.
    // Start this worker first to try and get proxy settings in place
    // before we do anything else.
    writeSystemFiles := shouldWriteProxyFiles(agentConfig)
    runner.StartWorker("proxyupdater", func() (worker.Worker, error) {
        return proxyupdater.New(st.Environment(), writeSystemFiles), nil
    })

    if isEnvironManager {
        runner.StartWorker("resumer", func() (worker.Worker, error) {
            // The action of resumer is so subtle that it is not tested,
            // because we can't figure out how to do so without
            // brutalising the transaction log.
            return newResumer(st.Resumer()), nil
        })
    }

    if feature.IsDbLogEnabled() {
        runner.StartWorker("logsender", func() (worker.Worker, error) {
            return logsender.New(a.bufferedLogs, gate.AlreadyUnlocked{}, a), nil
        })
    }

    envConfig, err := st.Environment().EnvironConfig()
    if err != nil {
        return nil, fmt.Errorf("cannot read environment config: %v", err)
    }
    ignoreMachineAddresses, _ := envConfig.IgnoreMachineAddresses()
    if ignoreMachineAddresses {
        logger.Infof("machine addresses not used, only addresses from provider")
    }
    runner.StartWorker("machiner", func() (worker.Worker, error) {
        accessor := machiner.APIMachineAccessor{st.Machiner()}
        return newMachiner(accessor, agentConfig, ignoreMachineAddresses), nil
    })
    runner.StartWorker("reboot", func() (worker.Worker, error) {
        reboot, err := st.Reboot()
        if err != nil {
            return nil, errors.Trace(err)
        }
        lock, err := cmdutil.HookExecutionLock(cmdutil.DataDir)
        if err != nil {
            return nil, errors.Trace(err)
        }
        return rebootworker.NewReboot(reboot, agentConfig, lock)
    })
    runner.StartWorker("apiaddressupdater", func() (worker.Worker, error) {
        addressUpdater := agent.APIHostPortsSetter{a}
        return apiaddressupdater.NewAPIAddressUpdater(st.Machiner(), addressUpdater), nil
    })
    runner.StartWorker("logger", func() (worker.Worker, error) {
        return workerlogger.NewLogger(st.Logger(), agentConfig), nil
    })

    if !featureflag.Enabled(feature.DisableRsyslog) {
        rsyslogMode := rsyslog.RsyslogModeForwarding
        if isEnvironManager {
            rsyslogMode = rsyslog.RsyslogModeAccumulate
        }

        runner.StartWorker("rsyslog", func() (worker.Worker, error) {
            return cmdutil.NewRsyslogConfigWorker(st.Rsyslog(), agentConfig, rsyslogMode)
        })
    }

    if !isEnvironManager {
        runner.StartWorker("stateconverter", func() (worker.Worker, error) {
            return worker.NewNotifyWorker(conv2state.New(st.Machiner(), a)), nil
        })
    }

    runner.StartWorker("diskmanager", func() (worker.Worker, error) {
        api, err := st.DiskManager()
        if err != nil {
            return nil, errors.Trace(err)
        }
        return newDiskManager(diskmanager.DefaultListBlockDevices, api), nil
    })
    runner.StartWorker("storageprovisioner-machine", func() (worker.Worker, error) {
        scope := agentConfig.Tag()
        api := st.StorageProvisioner(scope)
        storageDir := filepath.Join(agentConfig.DataDir(), "storage")
        return newStorageWorker(
            scope, storageDir, api, api, api, api, api, api,
            clock.WallClock,
        ), nil
    })

    // Check if the network management is disabled.
    disableNetworkManagement, _ := envConfig.DisableNetworkManagement()
    if disableNetworkManagement {
        logger.Infof("network management is disabled")
    }

    // Start networker depending on configuration and job.
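    // The networker runs in intrusive mode only when this machine has the
    // JobManageNetworking job and network management has not been disabled in
    // the environment config; otherwise it is started in non-intrusive mode.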
    intrusiveMode := false
    for _, job := range entity.Jobs() {
        if job == multiwatcher.JobManageNetworking {
            intrusiveMode = true
            break
        }
    }
    intrusiveMode = intrusiveMode && !disableNetworkManagement
    runner.StartWorker("networker", func() (worker.Worker, error) {
        return newNetworker(st.Networker(), agentConfig, intrusiveMode, networker.DefaultConfigBaseDir)
    })

    // If not a local provider bootstrap machine, start the worker to
    // manage SSH keys.
    providerType := agentConfig.Value(agent.ProviderType)
    if providerType != provider.Local || a.machineId != bootstrapMachineId {
        runner.StartWorker("authenticationworker", func() (worker.Worker, error) {
            return authenticationworker.NewWorker(st.KeyUpdater(), agentConfig), nil
        })
    }

    // Perform the operations needed to set up hosting for containers.
    if err := a.setupContainerSupport(runner, st, entity, agentConfig); err != nil {
        cause := errors.Cause(err)
        if params.IsCodeDead(cause) || cause == worker.ErrTerminateAgent {
            return nil, worker.ErrTerminateAgent
        }
        return nil, fmt.Errorf("setting up container support: %v", err)
    }
    for _, job := range entity.Jobs() {
        switch job {
        case multiwatcher.JobHostUnits:
            runner.StartWorker("deployer", func() (worker.Worker, error) {
                apiDeployer := st.Deployer()
                context := newDeployContext(apiDeployer, agentConfig)
                return deployer.NewDeployer(apiDeployer, context), nil
            })
        case multiwatcher.JobManageEnviron:
            runner.StartWorker("identity-file-writer", func() (worker.Worker, error) {
                inner := func(<-chan struct{}) error {
                    agentConfig := a.CurrentConfig()
                    return agent.WriteSystemIdentityFile(agentConfig)
                }
                return worker.NewSimpleWorker(inner), nil
            })
        case multiwatcher.JobManageStateDeprecated:
            // Legacy environments may set this, but we ignore it.
        default:
            // TODO(dimitern): Once all workers moved over to using
            // the API, report "unknown job type" here.
        }
    }

    return cmdutil.NewCloseWorker(logger, runner, st), nil // Note: a worker.Runner is itself a worker.Worker.
}

// Restart restarts the agent's service.
func (a *MachineAgent) Restart() error {
    name := a.CurrentConfig().Value(agent.AgentServiceName)
    return service.Restart(name)
}

func (a *MachineAgent) upgradeStepsWorkerStarter(
    st api.Connection,
    jobs []multiwatcher.MachineJob,
) func() (worker.Worker, error) {
    return func() (worker.Worker, error) {
        return a.upgradeWorkerContext.Worker(a, st, jobs), nil
    }
}

func (a *MachineAgent) agentUpgraderWorkerStarter(
    st *apiupgrader.State,
    agentConfig agent.Config,
) func() (worker.Worker, error) {
    return func() (worker.Worker, error) {
        return upgrader.NewAgentUpgrader(
            st,
            agentConfig,
            a.previousAgentVersion,
            a.upgradeWorkerContext.IsUpgradeRunning,
            a.initialAgentUpgradeCheckComplete,
        ), nil
    }
}

// shouldWriteProxyFiles returns true, unless the supplied conf identifies the
// machine agent running directly on the host system in a local environment.
var shouldWriteProxyFiles = func(conf agent.Config) bool {
    if conf.Value(agent.ProviderType) != provider.Local {
        return true
    }
    return conf.Tag() != names.NewMachineTag(bootstrapMachineId)
}

// setupContainerSupport determines what containers can be run on this machine and
// initialises suitable infrastructure to support such containers.
func (a *MachineAgent) setupContainerSupport(runner worker.Runner, st api.Connection, entity *apiagent.Entity, agentConfig agent.Config) error {
    var supportedContainers []instance.ContainerType
    // LXC containers are only supported on bare metal and fully virtualized linux systems.
    // Nested LXC containers and Windows machines cannot run LXC containers.
    supportsLXC, err := lxc.IsLXCSupported()
    if err != nil {
        logger.Warningf("no lxc containers possible: %v", err)
    }
    if err == nil && supportsLXC {
        supportedContainers = append(supportedContainers, instance.LXC)
    }

    supportsKvm, err := kvm.IsKVMSupported()
    if err != nil {
        logger.Warningf("determining kvm support: %v\nno kvm containers possible", err)
    }
    if err == nil && supportsKvm {
        supportedContainers = append(supportedContainers, instance.KVM)
    }
    return a.updateSupportedContainers(runner, st, entity.Tag(), supportedContainers, agentConfig)
}

// updateSupportedContainers records in state that a machine can run the specified containers.
// It starts a watcher and when a container of a given type is first added to the machine,
// the watcher is killed, the machine is set up to be able to start containers of the given type,
// and a suitable provisioner is started.
func (a *MachineAgent) updateSupportedContainers(
    runner worker.Runner,
    st api.Connection,
    machineTag string,
    containers []instance.ContainerType,
    agentConfig agent.Config,
) error {
    pr := st.Provisioner()
    tag, err := names.ParseMachineTag(machineTag)
    if err != nil {
        return err
    }
    machine, err := pr.Machine(tag)
    if errors.IsNotFound(err) || err == nil && machine.Life() == params.Dead {
        return worker.ErrTerminateAgent
    }
    if err != nil {
        return errors.Annotatef(err, "cannot load machine %s from state", tag)
    }
    if len(containers) == 0 {
        if err := machine.SupportsNoContainers(); err != nil {
            return errors.Annotatef(err, "clearing supported containers for %s", tag)
        }
        return nil
    }
    if err := machine.SetSupportedContainers(containers...); err != nil {
        return errors.Annotatef(err, "setting supported containers for %s", tag)
    }
    initLock, err := cmdutil.HookExecutionLock(agentConfig.DataDir())
    if err != nil {
        return err
    }
    // Start the watcher to fire when a container is first requested on the machine.
    envUUID, err := st.EnvironTag()
    if err != nil {
        return err
    }
    watcherName := fmt.Sprintf("%s-container-watcher", machine.Id())
    // There may not be a CA certificate private key available, and without
    // it we can't ensure that other Juju nodes can connect securely, so only
    // use an image URL getter if there's a private key.
    var imageURLGetter container.ImageURLGetter
    if agentConfig.Value(agent.AllowsSecureConnection) == "true" {
        imageURLGetter = container.NewImageURLGetter(st.Addr(), envUUID.Id(), []byte(agentConfig.CACert()))
    }
    params := provisioner.ContainerSetupParams{
        Runner:              runner,
        WorkerName:          watcherName,
        SupportedContainers: containers,
        ImageURLGetter:      imageURLGetter,
        Machine:             machine,
        Provisioner:         pr,
        Config:              agentConfig,
        InitLock:            initLock,
    }
    handler := provisioner.NewContainerSetupHandler(params)
    a.startWorkerAfterUpgrade(runner, watcherName, func() (worker.Worker, error) {
        return worker.NewStringsWorker(handler), nil
    })
    return nil
}

// StateWorker returns a worker running all the workers that require
// a *state.State connection.
func (a *MachineAgent) StateWorker() (worker.Worker, error) {
    agentConfig := a.CurrentConfig()

    // Start MongoDB server and dial.
    if err := a.ensureMongoServer(agentConfig); err != nil {
        return nil, err
    }
    st, m, err := openState(agentConfig, stateWorkerDialOpts)
    if err != nil {
        return nil, err
    }
    reportOpenedState(st)

    stor := statestorage.NewStorage(st.EnvironUUID(), st.MongoSession())
    registerSimplestreamsDataSource(stor)

    runner := newConnRunner(st)
    singularRunner, err := newSingularStateRunner(runner, st, m)
    if err != nil {
        return nil, errors.Trace(err)
    }

    // Take advantage of special knowledge here in that we will only ever want
    // the storage provider on one machine, and that is the "bootstrap" node.
    providerType := agentConfig.Value(agent.ProviderType)
    if (providerType == provider.Local || provider.IsManual(providerType)) && m.Id() == bootstrapMachineId {
        a.startWorkerAfterUpgrade(runner, "local-storage", func() (worker.Worker, error) {
            // TODO(axw) 2013-09-24 bug #1229507
            // Make another job to enable storage.
            // There's nothing special about this.
            return localstorage.NewWorker(agentConfig), nil
        })
    }
    for _, job := range m.Jobs() {
        switch job {
        case state.JobHostUnits:
            // Implemented in APIWorker.
        case state.JobManageEnviron:
            useMultipleCPUs()
            a.startWorkerAfterUpgrade(runner, "env worker manager", func() (worker.Worker, error) {
                return envworkermanager.NewEnvWorkerManager(st, a.startEnvWorkers), nil
            })
            a.startWorkerAfterUpgrade(runner, "peergrouper", func() (worker.Worker, error) {
                return peergrouperNew(st)
            })
            a.startWorkerAfterUpgrade(runner, "restore", func() (worker.Worker, error) {
                return a.newRestoreStateWatcherWorker(st)
            })
            certChangedChan := make(chan params.StateServingInfo, 1)
            runner.StartWorker("apiserver", a.apiserverWorkerStarter(st, certChangedChan))
            var stateServingSetter certupdater.StateServingInfoSetter = func(info params.StateServingInfo, done <-chan struct{}) error {
                return a.ChangeConfig(func(config agent.ConfigSetter) error {
                    config.SetStateServingInfo(info)
                    logger.Infof("update apiserver worker with new certificate")
                    select {
                    case certChangedChan <- info:
                        return nil
                    case <-done:
                        return nil
                    }
                })
            }
            a.startWorkerAfterUpgrade(runner, "certupdater", func() (worker.Worker, error) {
                return newCertificateUpdater(m, agentConfig, st, st, stateServingSetter, certChangedChan), nil
            })

            if feature.IsDbLogEnabled() {
                a.startWorkerAfterUpgrade(singularRunner, "dblogpruner", func() (worker.Worker, error) {
                    return dblogpruner.New(st, dblogpruner.NewLogPruneParams()), nil
                })
            }
            a.startWorkerAfterUpgrade(singularRunner, "statushistorypruner", func() (worker.Worker, error) {
                return statushistorypruner.New(st, statushistorypruner.NewHistoryPrunerParams()), nil
            })

            a.startWorkerAfterUpgrade(singularRunner, "txnpruner", func() (worker.Worker, error) {
                return txnpruner.New(st, time.Hour*2), nil
            })

        case state.JobManageStateDeprecated:
            // Legacy environments may set this, but we ignore it.
        default:
            logger.Warningf("ignoring unknown job %q", job)
        }
    }
    return cmdutil.NewCloseWorker(logger, runner, st), nil
}

// startEnvWorkers starts state server workers that need to run per
// environment.
func (a *MachineAgent) startEnvWorkers(
    ssSt envworkermanager.InitialState,
    st *state.State,
) (_ worker.Worker, err error) {
    envUUID := st.EnvironUUID()
    defer errors.DeferredAnnotatef(&err, "failed to start workers for env %s", envUUID)
    logger.Infof("starting workers for env %s", envUUID)

    // Establish API connection for this environment.
    agentConfig := a.CurrentConfig()
    apiInfo := agentConfig.APIInfo()
    apiInfo.EnvironTag = st.EnvironTag()
    apiSt, err := apicaller.OpenAPIStateUsingInfo(apiInfo, agentConfig.OldPassword())
    if err != nil {
        return nil, errors.Trace(err)
    }

    // Create a runner for workers specific to this
    // environment. Either the State or API connection failing will be
    // considered fatal, killing the runner and all its workers.
    runner := newConnRunner(st, apiSt)
    defer func() {
        if err != nil && runner != nil {
            runner.Kill()
            runner.Wait()
        }
    }()
    // Close the API connection when the runner for this environment dies.
    go func() {
        runner.Wait()
        err := apiSt.Close()
        if err != nil {
            logger.Errorf("failed to close API connection for env %s: %v", envUUID, err)
        }
    }()

    // Create a singular runner for this environment.
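    // Workers started on the singular runner only actually run while this
    // machine is the mongo master (see singularStateConn at the bottom of
    // this file).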
    machine, err := ssSt.Machine(a.machineId)
    if err != nil {
        return nil, errors.Trace(err)
    }
    singularRunner, err := newSingularStateRunner(runner, ssSt, machine)
    if err != nil {
        return nil, errors.Trace(err)
    }
    defer func() {
        if err != nil && singularRunner != nil {
            singularRunner.Kill()
            singularRunner.Wait()
        }
    }()

    // Start workers that depend on a *state.State.
    // TODO(fwereade): 2015-04-21 THIS SHALL NOT PASS
    // Seriously, these should all be using the API.
    singularRunner.StartWorker("minunitsworker", func() (worker.Worker, error) {
        return minunitsworker.NewMinUnitsWorker(st), nil
    })

    // Start workers that use an API connection.
    singularRunner.StartWorker("environ-provisioner", func() (worker.Worker, error) {
        return provisioner.NewEnvironProvisioner(apiSt.Provisioner(), agentConfig), nil
    })
    singularRunner.StartWorker("environ-storageprovisioner", func() (worker.Worker, error) {
        scope := st.EnvironTag()
        api := apiSt.StorageProvisioner(scope)
        return newStorageWorker(
            scope, "", api, api, api, api, api, api,
            clock.WallClock,
        ), nil
    })
    singularRunner.StartWorker("charm-revision-updater", func() (worker.Worker, error) {
        return charmrevisionworker.NewRevisionUpdateWorker(apiSt.CharmRevisionUpdater()), nil
    })
    runner.StartWorker("metricmanagerworker", func() (worker.Worker, error) {
        return metricworker.NewMetricsManager(getMetricAPI(apiSt))
    })
    singularRunner.StartWorker("instancepoller", func() (worker.Worker, error) {
        return newInstancePoller(apiSt.InstancePoller()), nil
    })
    singularRunner.StartWorker("cleaner", func() (worker.Worker, error) {
        return newCleaner(apiSt.Cleaner()), nil
    })
    singularRunner.StartWorker("addresserworker", func() (worker.Worker, error) {
        return newAddresser(apiSt.Addresser())
    })

    // TODO(axw) 2013-09-24 bug #1229506
    // Make another job to enable the firewaller. Not all
    // environments are capable of managing ports
    // centrally.
    fwMode, err := getFirewallMode(apiSt)
    if err != nil {
        return nil, errors.Annotate(err, "cannot get firewall mode")
    }
    if fwMode != config.FwNone {
        singularRunner.StartWorker("firewaller", func() (worker.Worker, error) {
            return newFirewaller(apiSt.Firewaller())
        })
    } else {
        logger.Debugf("not starting firewaller worker - firewall-mode is %q", fwMode)
    }

    return runner, nil
}

var getFirewallMode = _getFirewallMode

func _getFirewallMode(apiSt api.Connection) (string, error) {
    envConfig, err := apiSt.Environment().EnvironConfig()
    if err != nil {
        return "", errors.Annotate(err, "cannot read environment config")
    }
    return envConfig.FirewallMode(), nil
}

// stateWorkerDialOpts is a mongo.DialOpts suitable
// for use by StateWorker to dial mongo.
//
// This must be overridden in tests, as it assumes
// journaling is enabled.
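// The journaling assumption comes from the PostDial hook installed in init
// above, which requests journaled writes while ProductionMongoWriteConcern
// is true (a variable tests override).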
var stateWorkerDialOpts mongo.DialOpts

func (a *MachineAgent) apiserverWorkerStarter(st *state.State, certChanged chan params.StateServingInfo) func() (worker.Worker, error) {
    return func() (worker.Worker, error) { return a.newApiserverWorker(st, certChanged) }
}

func (a *MachineAgent) newApiserverWorker(st *state.State, certChanged chan params.StateServingInfo) (worker.Worker, error) {
    agentConfig := a.CurrentConfig()
    // If the configuration does not have the required information,
    // it is currently not a recoverable error, so we kill the whole
    // agent, potentially enabling human intervention to fix
    // the agent's configuration file.
    info, ok := agentConfig.StateServingInfo()
    if !ok {
        return nil, &cmdutil.FatalError{"StateServingInfo not available and we need it"}
    }
    cert := []byte(info.Cert)
    key := []byte(info.PrivateKey)

    if len(cert) == 0 || len(key) == 0 {
        return nil, &cmdutil.FatalError{"configuration does not have state server cert/key"}
    }
    tag := agentConfig.Tag()
    dataDir := agentConfig.DataDir()
    logDir := agentConfig.LogDir()

    endpoint := net.JoinHostPort("", strconv.Itoa(info.APIPort))
    listener, err := net.Listen("tcp", endpoint)
    if err != nil {
        return nil, err
    }
    return apiserver.NewServer(st, listener, apiserver.ServerConfig{
        Cert:        cert,
        Key:         key,
        Tag:         tag,
        DataDir:     dataDir,
        LogDir:      logDir,
        Validator:   a.limitLogins,
        CertChanged: certChanged,
    })
}

// limitLogins is called by the API server for each login attempt.
// It returns an error if upgrades or restore are running.
func (a *MachineAgent) limitLogins(req params.LoginRequest) error {
    if err := a.limitLoginsDuringRestore(req); err != nil {
        return err
    }
    return a.limitLoginsDuringUpgrade(req)
}

// limitLoginsDuringRestore will only allow logins for restore related purposes
// while the different steps of restore are running.
func (a *MachineAgent) limitLoginsDuringRestore(req params.LoginRequest) error {
    var err error
    switch {
    case a.IsRestoreRunning():
        err = apiserver.RestoreInProgressError
    case a.IsRestorePreparing():
        err = apiserver.AboutToRestoreError
    }
    if err != nil {
        authTag, parseErr := names.ParseTag(req.AuthTag)
        if parseErr != nil {
            return errors.Annotate(err, "could not parse auth tag")
        }
        switch authTag := authTag.(type) {
        case names.UserTag:
            // use a restricted API mode
            return err
        case names.MachineTag:
            if authTag == a.Tag() {
                // allow logins from the local machine
                return nil
            }
        }
        return errors.Errorf("login for %q blocked because restore is in progress", authTag)
    }
    return nil
}

// limitLoginsDuringUpgrade is called by the API server for each login
// attempt. It returns an error if upgrades are in progress unless the
// login is for a user (i.e. a client) or the local machine.
func (a *MachineAgent) limitLoginsDuringUpgrade(req params.LoginRequest) error {
    if a.upgradeWorkerContext.IsUpgradeRunning() || a.isAgentUpgradePending() {
        authTag, err := names.ParseTag(req.AuthTag)
        if err != nil {
            return errors.Annotate(err, "could not parse auth tag")
        }
        switch authTag := authTag.(type) {
        case names.UserTag:
            // use a restricted API mode
            return apiserver.UpgradeInProgressError
        case names.MachineTag:
            if authTag == a.Tag() {
                // allow logins from the local machine
                return nil
            }
        }
        return errors.Errorf("login for %q blocked because %s", authTag, apiserver.UpgradeInProgressError.Error())
    } else {
        return nil // allow all logins
    }
}

var stateWorkerServingConfigErr = errors.New("state worker started with no state serving info")

// ensureMongoServer ensures that mongo is installed and running,
// and ready for opening a state connection.
func (a *MachineAgent) ensureMongoServer(agentConfig agent.Config) (err error) {
    a.mongoInitMutex.Lock()
    defer a.mongoInitMutex.Unlock()
    if a.mongoInitialized {
        logger.Debugf("mongo is already initialized")
        return nil
    }
    defer func() {
        if err == nil {
            a.mongoInitialized = true
        }
    }()

    // Many of the steps here, such as adding the state server to the
    // admin DB and initiating the replicaset, are once-only actions,
    // required when upgrading from a pre-HA-capable
    // environment. These calls won't do anything if the thing they
    // need to set up has already been done.
    var needReplicasetInit = false
    var machineAddrs []network.Address

    mongoInstalled, err := mongo.IsServiceInstalled(agentConfig.Value(agent.Namespace))
    if err != nil {
        return errors.Annotate(err, "error while checking if mongodb service is installed")
    }

    if mongoInstalled {
        logger.Debugf("mongodb service is installed")

        if _, err := a.ensureMongoAdminUser(agentConfig); err != nil {
            return errors.Trace(err)
        }

        if err := a.ensureMongoSharedSecret(agentConfig); err != nil {
            return errors.Trace(err)
        }
        agentConfig = a.CurrentConfig() // ensureMongoSharedSecret may have updated the config

        mongoInfo, ok := agentConfig.MongoInfo()
        if !ok {
            return errors.New("unable to retrieve mongo info to check replicaset")
        }

        needReplicasetInit, err = isReplicasetInitNeeded(mongoInfo)
        if err != nil {
            return errors.Annotate(err, "error while checking replicaset")
        }

        // If the replicaset is to be initialised the machine addresses
        // need to be retrieved *before* MongoDB is restarted with the
        // --replset option (in EnsureMongoServer). Once MongoDB is
        // started with --replset it won't respond to queries until the
        // replicaset is initiated.
        if needReplicasetInit {
            logger.Infof("replicaset not yet configured")
            machineAddrs, err = getMachineAddresses(agentConfig)
            if err != nil {
                return errors.Trace(err)
            }
        }
    }

    // EnsureMongoServer installs/upgrades the init config as necessary.
    ensureServerParams, err := cmdutil.NewEnsureServerParams(agentConfig)
    if err != nil {
        return err
    }
    if err := cmdutil.EnsureMongoServer(ensureServerParams); err != nil {
        return err
    }

    // Initiate the replicaset if required.
    if needReplicasetInit {
        servingInfo, ok := agentConfig.StateServingInfo()
        if !ok {
            return stateWorkerServingConfigErr
        }
        mongoInfo, ok := agentConfig.MongoInfo()
        if !ok {
            return errors.New("unable to retrieve mongo info to initiate replicaset")
        }
        if err := initiateReplicaSet(mongoInfo, servingInfo.StatePort, machineAddrs); err != nil {
            return err
        }
    }

    return nil
}

// ensureMongoAdminUser ensures that the machine's mongo user is in
// the admin DB.
func (a *MachineAgent) ensureMongoAdminUser(agentConfig agent.Config) (added bool, err error) {
    mongoInfo, ok1 := agentConfig.MongoInfo()
    servingInfo, ok2 := agentConfig.StateServingInfo()
    if !ok1 || !ok2 {
        return false, stateWorkerServingConfigErr
    }
    dialInfo, err := mongo.DialInfo(mongoInfo.Info, mongo.DefaultDialOpts())
    if err != nil {
        return false, err
    }
    if len(dialInfo.Addrs) > 1 {
        logger.Infof("more than one state server; admin user must exist")
        return false, nil
    }
    return ensureMongoAdminUser(mongo.EnsureAdminUserParams{
        DialInfo:  dialInfo,
        Namespace: agentConfig.Value(agent.Namespace),
        DataDir:   agentConfig.DataDir(),
        Port:      servingInfo.StatePort,
        User:      mongoInfo.Tag.String(),
        Password:  mongoInfo.Password,
    })
}

// ensureMongoSharedSecret generates a MongoDB shared secret if
// required, updating the agent's config and state.
func (a *MachineAgent) ensureMongoSharedSecret(agentConfig agent.Config) error {
    servingInfo, ok := agentConfig.StateServingInfo()
    if !ok {
        return stateWorkerServingConfigErr
    }

    if servingInfo.SharedSecret != "" {
        return nil // Already done
    }

    logger.Infof("state serving info has no shared secret - generating")

    var err error
    servingInfo.SharedSecret, err = mongo.GenerateSharedSecret()
    if err != nil {
        return err
    }
    logger.Debugf("updating state serving info in agent config")
    if err = a.ChangeConfig(func(config agent.ConfigSetter) error {
        config.SetStateServingInfo(servingInfo)
        return nil
    }); err != nil {
        return err
    }
    agentConfig = a.CurrentConfig()

    logger.Debugf("updating state serving info in state")

    // Note: we set Direct=true in the mongo options because it's
    // possible that we've previously upgraded the mongo server's
    // configuration to form a replicaset, but failed to initiate it.
    dialOpts := mongo.DefaultDialOpts()
    dialOpts.Direct = true
    st, _, err := openState(agentConfig, dialOpts)
    if err != nil {
        return err
    }
    defer st.Close()

    ssi := cmdutil.ParamsStateServingInfoToStateStateServingInfo(servingInfo)
    if err := st.SetStateServingInfo(ssi); err != nil {
        return errors.Errorf("cannot set state serving info: %v", err)
    }

    logger.Infof("shared secret updated in state serving info")
    return nil
}

// isReplicasetInitNeeded returns true if the replicaset needs to be
// initiated.
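// It dials mongo directly and reports true when the current replicaset
// config cannot be read or has no members yet.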
func isReplicasetInitNeeded(mongoInfo *mongo.MongoInfo) (bool, error) {
    dialInfo, err := mongo.DialInfo(mongoInfo.Info, mongo.DefaultDialOpts())
    if err != nil {
        return false, errors.Annotate(err, "cannot generate dial info to check replicaset")
    }
    dialInfo.Username = mongoInfo.Tag.String()
    dialInfo.Password = mongoInfo.Password

    session, err := mgo.DialWithInfo(dialInfo)
    if err != nil {
        return false, errors.Annotate(err, "cannot dial mongo to check replicaset")
    }
    defer session.Close()

    cfg, err := replicaset.CurrentConfig(session)
    if err != nil {
        logger.Debugf("couldn't retrieve replicaset config (not fatal): %v", err)
        return true, nil
    }
    numMembers := len(cfg.Members)
    logger.Debugf("replicaset member count: %d", numMembers)
    return numMembers < 1, nil
}

// getMachineAddresses connects to state to determine the machine's
// network addresses.
func getMachineAddresses(agentConfig agent.Config) ([]network.Address, error) {
    logger.Debugf("opening state to get machine addresses")
    dialOpts := mongo.DefaultDialOpts()
    dialOpts.Direct = true
    st, m, err := openState(agentConfig, dialOpts)
    if err != nil {
        return nil, errors.Annotate(err, "failed to open state to retrieve machine addresses")
    }
    defer st.Close()
    return m.Addresses(), nil
}

// initiateReplicaSet connects to MongoDB and sets up the replicaset.
func initiateReplicaSet(mongoInfo *mongo.MongoInfo, statePort int, machineAddrs []network.Address) error {
    peerAddr := mongo.SelectPeerAddress(machineAddrs)
    if peerAddr == "" {
        return errors.Errorf("no appropriate peer address found in %q", machineAddrs)
    }

    dialInfo, err := mongo.DialInfo(mongoInfo.Info, mongo.DefaultDialOpts())
    if err != nil {
        return errors.Annotate(err, "cannot generate dial info to initiate replicaset")
    }

    if err := maybeInitiateMongoServer(peergrouper.InitiateMongoParams{
        DialInfo:       dialInfo,
        MemberHostPort: net.JoinHostPort(peerAddr, fmt.Sprint(statePort)),
        User:           mongoInfo.Tag.String(), // TODO(dfc) InitiateMongoParams should take a Tag
        Password:       mongoInfo.Password,
    }); err != nil && err != peergrouper.ErrReplicaSetAlreadyInitiated {
        return err
    }
    return nil
}

func openState(agentConfig agent.Config, dialOpts mongo.DialOpts) (_ *state.State, _ *state.Machine, err error) {
    info, ok := agentConfig.MongoInfo()
    if !ok {
        return nil, nil, fmt.Errorf("no state info available")
    }
    st, err := state.Open(agentConfig.Environment(), info, dialOpts, environs.NewStatePolicy())
    if err != nil {
        return nil, nil, err
    }
    defer func() {
        if err != nil {
            st.Close()
        }
    }()
    m0, err := st.FindEntity(agentConfig.Tag())
    if err != nil {
        if errors.IsNotFound(err) {
            err = worker.ErrTerminateAgent
        }
        return nil, nil, err
    }
    m := m0.(*state.Machine)
    if m.Life() == state.Dead {
        return nil, nil, worker.ErrTerminateAgent
    }
    // Check the machine nonce as provisioned matches the agent.Conf value.
    if !m.CheckProvisioned(agentConfig.Nonce()) {
        // The agent is running on a different machine to the one it
        // should be according to state. It must stop immediately.
        logger.Errorf("running machine %v agent on inappropriate instance", m)
        return nil, nil, worker.ErrTerminateAgent
    }
    return st, m, nil
}

// startWorkerAfterUpgrade starts a worker to run the specified child worker
// but only after waiting for upgrades to complete.
func (a *MachineAgent) startWorkerAfterUpgrade(runner worker.Runner, name string, start func() (worker.Worker, error)) {
    runner.StartWorker(name, func() (worker.Worker, error) {
        return a.upgradeWaiterWorker(name, start), nil
    })
}

// upgradeWaiterWorker runs the specified worker after upgrades have completed.
func (a *MachineAgent) upgradeWaiterWorker(name string, start func() (worker.Worker, error)) worker.Worker {
    return worker.NewSimpleWorker(func(stop <-chan struct{}) error {
        // Wait for the agent upgrade and upgrade steps to complete (or for us to be stopped).
        for _, ch := range []chan struct{}{
            a.upgradeWorkerContext.UpgradeComplete,
            a.initialAgentUpgradeCheckComplete,
        } {
            select {
            case <-stop:
                return nil
            case <-ch:
            }
        }
        logger.Debugf("upgrades done, starting worker %q", name)

        // For windows clients we need to make sure we set a random password in a
        // registry file and use that password for the jujud user and its services
        // before starting anything else.
        // Services on windows need to know the user's password to start up. The only
        // way to store that password securely is if the user running the services
        // sets the password. This cannot be done during cloud-init, so it is done here.
        // This needs to run between finishing the upgrades and starting
        // the rest of the workers (in particular the deployer, which should use
        // the new password).
        if err := password.EnsureJujudPassword(); err != nil {
            return errors.Annotate(err, "Could not ensure jujud password")
        }

        // Upgrades are done, start the worker.
        worker, err := start()
        if err != nil {
            return err
        }
        // Wait for worker to finish or for us to be stopped.
        waitCh := make(chan error)
        go func() {
            waitCh <- worker.Wait()
        }()
        select {
        case err := <-waitCh:
            logger.Debugf("worker %q exited with %v", name, err)
            return err
        case <-stop:
            logger.Debugf("stopping so killing worker %q", name)
            worker.Kill()
        }
        return <-waitCh // Ensure worker has stopped before returning.
    })
}

func (a *MachineAgent) setMachineStatus(apiState api.Connection, status params.Status, info string) error {
    tag := a.Tag().(names.MachineTag)
    machine, err := apiState.Machiner().Machine(tag)
    if err != nil {
        return errors.Trace(err)
    }
    if err := machine.SetStatus(status, info, nil); err != nil {
        return errors.Trace(err)
    }
    return nil
}

// WorkersStarted returns a channel that's closed once all top level workers
// have been started. This is provided for testing purposes.
func (a *MachineAgent) WorkersStarted() <-chan struct{} {
    return a.workersStarted
}

func (a *MachineAgent) Tag() names.Tag {
    return names.NewMachineTag(a.machineId)
}

func (a *MachineAgent) createJujuRun(dataDir string) error {
    // TODO do not remove the symlink if it already points
    // to the right place.
    if err := os.Remove(JujuRun); err != nil && !os.IsNotExist(err) {
        return err
    }
    jujud := filepath.Join(dataDir, "tools", a.Tag().String(), jujunames.Jujud)
    return symlink.New(jujud, JujuRun)
}

func (a *MachineAgent) uninstallAgent(agentConfig agent.Config) error {
    var errors []error
    agentServiceName := agentConfig.Value(agent.AgentServiceName)
    if agentServiceName == "" {
        // For backwards compatibility, handle lack of AgentServiceName.
        agentServiceName = os.Getenv("UPSTART_JOB")
    }
    if agentServiceName != "" {
        svc, err := service.DiscoverService(agentServiceName, common.Conf{})
        if err != nil {
            errors = append(errors, fmt.Errorf("cannot remove service %q: %v", agentServiceName, err))
        } else if err := svc.Remove(); err != nil {
            errors = append(errors, fmt.Errorf("cannot remove service %q: %v", agentServiceName, err))
        }
    }

    // Remove the juju-run symlink.
    if err := os.Remove(JujuRun); err != nil && !os.IsNotExist(err) {
        errors = append(errors, err)
    }

    insideLXC, err := lxcutils.RunningInsideLXC()
    if err != nil {
        errors = append(errors, err)
    } else if insideLXC {
        // We're running inside LXC, so loop devices may leak. Detach
        // any loop devices that are backed by files on this machine.
        //
        // It is necessary to do this here as well as in container/lxc,
        // as container/lxc needs to check in the container's rootfs
        // to see if the loop device is attached to the container; that
        // will fail if the data-dir is removed first.
        if err := a.loopDeviceManager.DetachLoopDevices("/", agentConfig.DataDir()); err != nil {
            errors = append(errors, err)
        }
    }

    namespace := agentConfig.Value(agent.Namespace)
    if err := mongo.RemoveService(namespace); err != nil {
        errors = append(errors, fmt.Errorf("cannot stop/remove mongo service with namespace %q: %v", namespace, err))
    }
    if err := os.RemoveAll(agentConfig.DataDir()); err != nil {
        errors = append(errors, err)
    }
    if len(errors) == 0 {
        return nil
    }
    return fmt.Errorf("uninstall failed: %v", errors)
}

func newConnRunner(conns ...cmdutil.Pinger) worker.Runner {
    return worker.NewRunner(cmdutil.ConnectionIsFatal(logger, conns...), cmdutil.MoreImportant)
}

type MongoSessioner interface {
    MongoSession() *mgo.Session
}

func newSingularStateRunner(runner worker.Runner, st MongoSessioner, m *state.Machine) (worker.Runner, error) {
    singularStateConn := singularStateConn{st.MongoSession(), m}
    singularRunner, err := newSingularRunner(runner, singularStateConn)
    if err != nil {
        return nil, errors.Annotate(err, "cannot make singular State Runner")
    }
    return singularRunner, err
}

// singularStateConn implements singular.Conn on
// top of a State connection.
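// IsMaster reports whether this machine is the current mongo master by
// delegating to mongo.IsMaster with the connection's session and machine;
// Ping simply pings the underlying mongo session.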
type singularStateConn struct {
    session *mgo.Session
    machine *state.Machine
}

func (c singularStateConn) IsMaster() (bool, error) {
    return mongo.IsMaster(c.session, c.machine)
}

func (c singularStateConn) Ping() error {
    return c.session.Ping()
}

func metricAPI(st api.Connection) metricsmanager.MetricsManagerClient {
    return metricsmanager.NewClient(st)
}

// newDeployContext gives the tests the opportunity to create a deployer.Context
// that can be used for testing, so as to (1) avoid deploying units to the system
// running the tests and (2) get access to the *State used internally, so that
// tests can be run without waiting for the 5s watcher refresh time to which we would
// otherwise be restricted.
var newDeployContext = func(st *apideployer.State, agentConfig agent.Config) deployer.Context {
    return deployer.NewSimpleContext(agentConfig, st)
}