github.com/altoros/juju-vmware@v0.0.0-20150312064031-f19ae857ccca/cmd/jujud/agent/machine.go

// Copyright 2012, 2013 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package agent

import (
	"fmt"
	"net"
	"os"
	"path/filepath"
	"runtime"
	"strconv"
	"sync"
	"time"

	"github.com/juju/cmd"
	"github.com/juju/errors"
	"github.com/juju/loggo"
	"github.com/juju/names"
	"github.com/juju/utils"
	"github.com/juju/utils/featureflag"
	"github.com/juju/utils/symlink"
	"github.com/juju/utils/voyeur"
	"gopkg.in/juju/charm.v4"
	"gopkg.in/mgo.v2"
	"gopkg.in/natefinch/lumberjack.v2"
	"launchpad.net/gnuflag"
	"launchpad.net/tomb"

	"github.com/juju/juju/agent"
	"github.com/juju/juju/api"
	apiagent "github.com/juju/juju/api/agent"
	apideployer "github.com/juju/juju/api/deployer"
	"github.com/juju/juju/api/metricsmanager"
	"github.com/juju/juju/apiserver"
	"github.com/juju/juju/apiserver/params"
	"github.com/juju/juju/cmd/jujud/reboot"
	cmdutil "github.com/juju/juju/cmd/jujud/util"
	"github.com/juju/juju/container"
	"github.com/juju/juju/container/kvm"
	"github.com/juju/juju/container/lxc"
	"github.com/juju/juju/environs"
	"github.com/juju/juju/environs/config"
	"github.com/juju/juju/instance"
	jujunames "github.com/juju/juju/juju/names"
	"github.com/juju/juju/juju/paths"
	"github.com/juju/juju/lease"
	"github.com/juju/juju/mongo"
	"github.com/juju/juju/network"
	"github.com/juju/juju/provider"
	"github.com/juju/juju/replicaset"
	"github.com/juju/juju/service"
	"github.com/juju/juju/service/common"
	"github.com/juju/juju/state"
	"github.com/juju/juju/state/multiwatcher"
	statestorage "github.com/juju/juju/state/storage"
	"github.com/juju/juju/storage"
	coretools "github.com/juju/juju/tools"
	"github.com/juju/juju/version"
	"github.com/juju/juju/worker"
	"github.com/juju/juju/worker/apiaddressupdater"
	"github.com/juju/juju/worker/authenticationworker"
	"github.com/juju/juju/worker/certupdater"
	"github.com/juju/juju/worker/charmrevisionworker"
	"github.com/juju/juju/worker/cleaner"
	"github.com/juju/juju/worker/deployer"
	"github.com/juju/juju/worker/diskmanager"
	"github.com/juju/juju/worker/envworkermanager"
	"github.com/juju/juju/worker/firewaller"
	"github.com/juju/juju/worker/instancepoller"
	"github.com/juju/juju/worker/localstorage"
	workerlogger "github.com/juju/juju/worker/logger"
	"github.com/juju/juju/worker/machiner"
	"github.com/juju/juju/worker/metricworker"
	"github.com/juju/juju/worker/minunitsworker"
	"github.com/juju/juju/worker/networker"
	"github.com/juju/juju/worker/peergrouper"
	"github.com/juju/juju/worker/provisioner"
	"github.com/juju/juju/worker/proxyupdater"
	rebootworker "github.com/juju/juju/worker/reboot"
	"github.com/juju/juju/worker/resumer"
	"github.com/juju/juju/worker/rsyslog"
	"github.com/juju/juju/worker/singular"
	"github.com/juju/juju/worker/terminationworker"
	"github.com/juju/juju/worker/upgrader"
)

const bootstrapMachineId = "0"

var (
	logger     = loggo.GetLogger("juju.cmd.jujud")
	retryDelay = 3 * time.Second
	JujuRun    = paths.MustSucceed(paths.JujuRun(version.Current.Series))

	// The following are defined as variables to allow the tests to
	// intercept calls to the functions.
	useMultipleCPUs          = utils.UseMultipleCPUs
	maybeInitiateMongoServer = peergrouper.MaybeInitiateMongoServer
	ensureMongoAdminUser     = mongo.EnsureAdminUser
	newSingularRunner        = singular.New
	peergrouperNew           = peergrouper.New
	newNetworker             = networker.NewNetworker
	newFirewaller            = firewaller.NewFirewaller
	newDiskManager           = diskmanager.NewWorker
	newCertificateUpdater    = certupdater.NewCertificateUpdater
	reportOpenedState        = func(interface{}) {}
	reportOpenedAPI          = func(interface{}) {}
	getMetricAPI             = metricAPI
)
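
// An illustrative sketch (hypothetical test code, not part of this file) of
// how such a seam is intercepted, assuming a test suite that embeds
// CleanupSuite from github.com/juju/testing:
//
//	func (s *MachineSuite) TestRetriesQuickly(c *gc.C) {
//		// Swap the package-level variable for this test only; the
//		// suite restores the original value during test teardown.
//		s.PatchValue(&retryDelay, time.Millisecond)
//		// ... exercise the agent ...
//	}
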
func init() {
	stateWorkerDialOpts = mongo.DefaultDialOpts()
	stateWorkerDialOpts.PostDial = func(session *mgo.Session) error {
		safe := mgo.Safe{
			// Wait for group commit if journaling is enabled,
			// which is always true in production.
			J: true,
		}
		_, err := replicaset.CurrentConfig(session)
		if err == nil {
			// Set mongo to write-majority (writes only returned after
			// replicated to a majority of replica-set members).
			safe.WMode = "majority"
		}
		session.SetSafe(&safe)
		return nil
	}
}

// AgentInitializer handles initializing a type for use as a Jujud
// agent.
type AgentInitializer interface {
	AddFlags(*gnuflag.FlagSet)
	CheckArgs([]string) error
}

// AgentConfigWriter encapsulates disk I/O operations with the agent
// config.
type AgentConfigWriter interface {
	// ReadConfig reads the config for the given tag from disk.
	ReadConfig(tag string) error
	// ChangeConfig executes the given AgentConfigMutator in a
	// thread-safe context.
	ChangeConfig(AgentConfigMutator) error
	// CurrentConfig returns a copy of the in-memory agent config.
	CurrentConfig() agent.Config
}
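
// A minimal usage sketch: elsewhere in this file the agent mutates its own
// configuration through this interface, along the lines of
//
//	err := a.ChangeConfig(func(config agent.ConfigSetter) error {
//		config.SetStateServingInfo(info)
//		return nil
//	})
//
// which serialises the mutation against other readers and writers of the
// agent config.
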
// NewMachineAgentCmd creates a Command which handles parsing
// command-line arguments and instantiating and running a
// MachineAgent.
func NewMachineAgentCmd(
	machineAgentFactory func(string) *MachineAgent,
	agentInitializer AgentInitializer,
	configFetcher AgentConfigWriter,
) cmd.Command {
	return &machineAgentCmd{
		machineAgentFactory: machineAgentFactory,
		agentInitializer:    agentInitializer,
		currentConfig:       configFetcher,
	}
}

type machineAgentCmd struct {
	cmd.CommandBase

	// This group of arguments is required.
	agentInitializer    AgentInitializer
	currentConfig       AgentConfigWriter
	machineAgentFactory func(string) *MachineAgent

	// This group is for debugging purposes.
	logToStdErr bool

	// The following are set via command-line flags.
	machineId string
}

// Init is called by the cmd system to initialize the structure for
// running.
func (a *machineAgentCmd) Init(args []string) error {
	if !names.IsValidMachine(a.machineId) {
		return fmt.Errorf("--machine-id option must be set, and expects a non-negative integer")
	}
	if err := a.agentInitializer.CheckArgs(args); err != nil {
		return err
	}

	// Due to changes in the logging, and needing to care about old
	// environments that have been upgraded, we need to explicitly remove the
	// file writer if one has been added, otherwise we will get duplicate
	// lines of all logging in the log file.
	loggo.RemoveWriter("logfile")

	if a.logToStdErr {
		return nil
	}

	err := a.currentConfig.ReadConfig(names.NewMachineTag(a.machineId).String())
	if err != nil {
		return errors.Annotate(err, "cannot read agent configuration")
	}
	agentConfig := a.currentConfig.CurrentConfig()
	filename := filepath.Join(agentConfig.LogDir(), agentConfig.Tag().String()+".log")

	log := &lumberjack.Logger{
		Filename:   filename,
		MaxSize:    300, // megabytes
		MaxBackups: 2,
	}

	return cmdutil.SwitchProcessToRollingLogs(log)
}

// Run instantiates a MachineAgent and runs it.
func (a *machineAgentCmd) Run(c *cmd.Context) error {
	machineAgent := a.machineAgentFactory(a.machineId)
	return machineAgent.Run(c)
}

// SetFlags adds the requisite flags to run this command.
func (a *machineAgentCmd) SetFlags(f *gnuflag.FlagSet) {
	a.agentInitializer.AddFlags(f)
	f.StringVar(&a.machineId, "machine-id", "", "id of the machine to run")
}

// Info returns usage information for the command.
func (a *machineAgentCmd) Info() *cmd.Info {
	return &cmd.Info{
		Name:    "machine",
		Purpose: "run a juju machine agent",
	}
}

// MachineAgentFactoryFn returns a function which instantiates a
// MachineAgent given a machineId.
func MachineAgentFactoryFn(
	agentConfWriter AgentConfigWriter,
	apiAddressSetter apiaddressupdater.APIAddressSetter,
) func(string) *MachineAgent {
	return func(machineId string) *MachineAgent {
		return NewMachineAgent(
			machineId,
			agentConfWriter,
			apiAddressSetter,
			NewUpgradeWorkerContext(),
			worker.NewRunner(cmdutil.IsFatal, cmdutil.MoreImportant),
		)
	}
}
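
// A wiring sketch (hypothetical; the real wiring lives in the jujud main
// package and may differ in detail). Assuming an agentConf value that
// implements AgentConfigWriter, AgentInitializer and
// apiaddressupdater.APIAddressSetter:
//
//	factory := MachineAgentFactoryFn(agentConf, agentConf)
//	machineCmd := NewMachineAgentCmd(factory, agentConf, agentConf)
//	// machineCmd is then registered with the jujud supercommand and its
//	// Init/Run methods are driven by the cmd package.
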
// NewMachineAgent instantiates a new MachineAgent.
func NewMachineAgent(
	machineId string,
	agentConfWriter AgentConfigWriter,
	apiAddressSetter apiaddressupdater.APIAddressSetter,
	upgradeWorkerContext *upgradeWorkerContext,
	runner worker.Runner,
) *MachineAgent {
	return &MachineAgent{
		machineId:            machineId,
		AgentConfigWriter:    agentConfWriter,
		apiAddressSetter:     apiAddressSetter,
		workersStarted:       make(chan struct{}),
		upgradeWorkerContext: upgradeWorkerContext,
		runner:               runner,
	}
}

// MachineAgent is responsible for tying together all functionality
// needed to orchestrate a Jujud instance which controls a machine.
type MachineAgent struct {
	AgentConfigWriter

	tomb                 tomb.Tomb
	machineId            string
	previousAgentVersion version.Number
	apiAddressSetter     apiaddressupdater.APIAddressSetter
	runner               worker.Runner
	configChangedVal     voyeur.Value
	upgradeWorkerContext *upgradeWorkerContext
	restoreMode          bool
	restoring            bool
	workersStarted       chan struct{}

	mongoInitMutex   sync.Mutex
	mongoInitialized bool
}

// IsRestorePreparing returns bool representing if we are in restore mode
// but not running restore.
func (a *MachineAgent) IsRestorePreparing() bool {
	return a.restoreMode && !a.restoring
}

// IsRestoreRunning returns bool representing if we are in restore mode
// and running the actual restore process.
func (a *MachineAgent) IsRestoreRunning() bool {
	return a.restoring
}

// Wait waits for the machine agent to finish.
func (a *MachineAgent) Wait() error {
	return a.tomb.Wait()
}

// Stop stops the machine agent.
func (a *MachineAgent) Stop() error {
	a.runner.Kill()
	return a.tomb.Wait()
}

// Dying returns the channel that can be used to see if the machine
// agent is terminating.
func (a *MachineAgent) Dying() <-chan struct{} {
	return a.tomb.Dying()
}

// Run runs a machine agent.
func (a *MachineAgent) Run(*cmd.Context) error {
	defer a.tomb.Done()
	if err := a.ReadConfig(a.Tag().String()); err != nil {
		return fmt.Errorf("cannot read agent configuration: %v", err)
	}
	agentConfig := a.CurrentConfig()

	logger.Infof("machine agent %v start (%s [%s])", a.Tag(), version.Current, runtime.Compiler)
	if flags := featureflag.String(); flags != "" {
		logger.Warningf("developer feature flags enabled: %s", flags)
	}

	if err := a.upgradeWorkerContext.InitializeUsingAgent(a); err != nil {
		return errors.Annotate(err, "error during upgradeWorkerContext initialisation")
	}
	a.configChangedVal.Set(struct{}{})
	a.previousAgentVersion = agentConfig.UpgradedToVersion()
	network.InitializeFromConfig(agentConfig)
	charm.CacheDir = filepath.Join(agentConfig.DataDir(), "charmcache")
	if err := a.createJujuRun(agentConfig.DataDir()); err != nil {
		return fmt.Errorf("cannot create juju run symlink: %v", err)
	}
	a.runner.StartWorker("api", a.APIWorker)
	a.runner.StartWorker("statestarter", a.newStateStarterWorker)
	a.runner.StartWorker("termination", func() (worker.Worker, error) {
		return terminationworker.NewWorker(), nil
	})
	// At this point, all workers will have been configured to start.
	close(a.workersStarted)
	err := a.runner.Wait()
	switch err {
	case worker.ErrTerminateAgent:
		err = a.uninstallAgent(agentConfig)
	case worker.ErrRebootMachine:
		logger.Infof("Caught reboot error")
		err = a.executeRebootOrShutdown(params.ShouldReboot)
	case worker.ErrShutdownMachine:
		logger.Infof("Caught shutdown error")
		err = a.executeRebootOrShutdown(params.ShouldShutdown)
	}
	err = cmdutil.AgentDone(logger, err)
	a.tomb.Kill(err)
	return err
}
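
// The switch above reacts to sentinel errors with which workers signal
// machine-level actions. As an illustrative sketch (hypothetical worker, not
// in this file), any worker running under the agent's runner can request a
// reboot simply by failing with the appropriate sentinel:
//
//	func (w *someWorker) loop() error {
//		if rebootRequired {
//			return worker.ErrRebootMachine
//		}
//		// ...
//	}
//
// The runner's IsFatal function treats these sentinels as fatal, so the
// runner tears down its other workers and Run then schedules the reboot via
// executeRebootOrShutdown.
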
func (a *MachineAgent) executeRebootOrShutdown(action params.RebootAction) error {
	agentCfg := a.CurrentConfig()
	// At this stage, all API connections would have been closed.
	// We need to reopen the API to clear the reboot flag after
	// scheduling the reboot. It may be cleaner to do this in the reboot
	// worker, before returning the ErrRebootMachine.
	st, _, err := OpenAPIState(agentCfg, a)
	if err != nil {
		logger.Infof("Reboot: Error connecting to state")
		return errors.Trace(err)
	}
	// Block until all units/containers are ready, and reboot/shutdown.
	finalize, err := reboot.NewRebootWaiter(st, agentCfg)
	if err != nil {
		return errors.Trace(err)
	}

	logger.Infof("Reboot: Executing reboot")
	err = finalize.ExecuteReboot(action)
	if err != nil {
		logger.Infof("Reboot: Error executing reboot: %v", err)
		return errors.Trace(err)
	}
	// On Windows, the shutdown command is asynchronous. We return ErrRebootMachine
	// so the agent will simply exit without error, pending reboot/shutdown.
	return worker.ErrRebootMachine
}

// ChangeConfig mutates the agent configuration via the embedded
// AgentConfigWriter, and notifies watchers of the change.
func (a *MachineAgent) ChangeConfig(mutate AgentConfigMutator) error {
	err := a.AgentConfigWriter.ChangeConfig(mutate)
	a.configChangedVal.Set(struct{}{})
	if err != nil {
		return errors.Trace(err)
	}
	return nil
}

// PrepareRestore flags the agent to allow only the limited set of commands
// defined in "github.com/juju/juju/apiserver".allowedMethodsAboutToRestore.
// The most noteworthy of these is Backups.Restore, which ensures that we can
// do all the file movements required for restore and that no one else makes
// changes while we do so. It returns an error if the machine is already in
// restore mode.
func (a *MachineAgent) PrepareRestore() error {
	if a.restoreMode {
		return errors.Errorf("already in restore mode")
	}
	a.restoreMode = true
	return nil
}

// BeginRestore flags the agent to disallow all commands, since restore
// should be running and therefore making changes that anything else would
// override.
func (a *MachineAgent) BeginRestore() error {
	switch {
	case !a.restoreMode:
		return errors.Errorf("not in restore mode, cannot begin restoration")
	case a.restoring:
		return errors.Errorf("already restoring")
	}
	a.restoring = true
	return nil
}

// newRestoreStateWatcherWorker returns a worker, or an error on failure.
// The worker watches the restoreInfo document for changes and puts the
// agent into the appropriate restore mode.
func (a *MachineAgent) newRestoreStateWatcherWorker(st *state.State) (worker.Worker, error) {
	rWorker := func(stopch <-chan struct{}) error {
		return a.restoreStateWatcher(st, stopch)
	}
	return worker.NewSimpleWorker(rWorker), nil
}

// restoreChanged is called whenever the restoreInfo document changes,
// signaling a new step in the restore process.
func (a *MachineAgent) restoreChanged(st *state.State) error {
	rinfo, err := st.EnsureRestoreInfo()
	if err != nil {
		return errors.Annotate(err, "cannot read restore state")
	}
	switch rinfo.Status() {
	case state.RestorePending:
		a.PrepareRestore()
	case state.RestoreInProgress:
		a.BeginRestore()
	}
	return nil
}

// restoreStateWatcher watches restoreInfo, looking for changes in the restore process.
func (a *MachineAgent) restoreStateWatcher(st *state.State, stopch <-chan struct{}) error {
	restoreWatch := st.WatchRestoreInfoChanges()
	defer func() {
		restoreWatch.Kill()
		restoreWatch.Wait()
	}()

	for {
		select {
		case <-restoreWatch.Changes():
			if err := a.restoreChanged(st); err != nil {
				return err
			}
		case <-stopch:
			return nil
		}
	}
}

// newStateStarterWorker wraps stateStarter in a simple worker for use in
// a.runner.StartWorker.
func (a *MachineAgent) newStateStarterWorker() (worker.Worker, error) {
	return worker.NewSimpleWorker(a.stateStarter), nil
}

// stateStarter watches for changes to the agent configuration, and
// starts or stops the state worker as appropriate. We watch the agent
// configuration because the agent configuration has all the details
// that we need to start a state server, whether they have been cached
// or read from the state.
//
// It will stop working as soon as stopch is closed.
func (a *MachineAgent) stateStarter(stopch <-chan struct{}) error {
	confWatch := a.configChangedVal.Watch()
	defer confWatch.Close()
	watchCh := make(chan struct{})
	go func() {
		for confWatch.Next() {
			watchCh <- struct{}{}
		}
	}()
	for {
		select {
		case <-watchCh:
			agentConfig := a.CurrentConfig()

			// N.B. StartWorker and StopWorker are idempotent.
			_, ok := agentConfig.StateServingInfo()
			if ok {
				a.runner.StartWorker("state", func() (worker.Worker, error) {
					return a.StateWorker()
				})
			} else {
				a.runner.StopWorker("state")
			}
		case <-stopch:
			return nil
		}
	}
}

// APIWorker returns a Worker that connects to the API and starts any
// workers that need an API connection.
func (a *MachineAgent) APIWorker() (worker.Worker, error) {
	agentConfig := a.CurrentConfig()
	st, entity, err := OpenAPIState(agentConfig, a)
	if err != nil {
		return nil, err
	}
	reportOpenedAPI(st)

	// Refresh the configuration, since it may have been updated after opening state.
	agentConfig = a.CurrentConfig()
	for _, job := range entity.Jobs() {
		if job.NeedsState() {
			info, err := st.Agent().StateServingInfo()
			if err != nil {
				return nil, fmt.Errorf("cannot get state serving info: %v", err)
			}
			err = a.ChangeConfig(func(config agent.ConfigSetter) error {
				config.SetStateServingInfo(info)
				return nil
			})
			if err != nil {
				return nil, err
			}
			agentConfig = a.CurrentConfig()
			break
		}
	}

	// Before starting any workers, ensure we record the Juju version this machine
	// agent is running.
	currentTools := &coretools.Tools{Version: version.Current}
	if err := st.Upgrader().SetVersion(agentConfig.Tag().String(), currentTools.Version); err != nil {
		return nil, errors.Annotate(err, "cannot set machine agent version")
	}

	runner := newConnRunner(st)

	// Run the upgrader and the upgrade-steps worker without waiting for
	// the upgrade steps to complete.
	runner.StartWorker("upgrader", func() (worker.Worker, error) {
		return upgrader.NewUpgrader(
			st.Upgrader(),
			agentConfig,
			a.previousAgentVersion,
			a.upgradeWorkerContext.IsUpgradeRunning,
		), nil
	})
	runner.StartWorker("upgrade-steps", a.upgradeStepsWorkerStarter(st, entity.Jobs()))

	// All other workers must wait for the upgrade steps to complete before starting.
	a.startWorkerAfterUpgrade(runner, "api-post-upgrade", func() (worker.Worker, error) {
		return a.postUpgradeAPIWorker(st, agentConfig, entity)
	})

	return cmdutil.NewCloseWorker(logger, runner, st), nil // Note: a worker.Runner is itself a worker.Worker.
}
func (a *MachineAgent) postUpgradeAPIWorker(
	st *api.State,
	agentConfig agent.Config,
	entity *apiagent.Entity,
) (worker.Worker, error) {
	rsyslogMode := rsyslog.RsyslogModeForwarding
	var err error
	for _, job := range entity.Jobs() {
		if job == multiwatcher.JobManageEnviron {
			rsyslogMode = rsyslog.RsyslogModeAccumulate
			break
		}
	}

	runner := newConnRunner(st)
	// TODO(fwereade): this is *still* a hideous layering violation, but at least
	// it's confined to jujud rather than extending into the worker itself.
	// Start this worker first to try and get proxy settings in place
	// before we do anything else.
	writeSystemFiles := shouldWriteProxyFiles(agentConfig)
	runner.StartWorker("proxyupdater", func() (worker.Worker, error) {
		return proxyupdater.New(st.Environment(), writeSystemFiles), nil
	})

	runner.StartWorker("machiner", func() (worker.Worker, error) {
		return machiner.NewMachiner(st.Machiner(), agentConfig), nil
	})
	runner.StartWorker("reboot", func() (worker.Worker, error) {
		reboot, err := st.Reboot()
		if err != nil {
			return nil, errors.Trace(err)
		}
		lock, err := cmdutil.HookExecutionLock(cmdutil.DataDir)
		if err != nil {
			return nil, errors.Trace(err)
		}
		return rebootworker.NewReboot(reboot, agentConfig, lock)
	})
	runner.StartWorker("apiaddressupdater", func() (worker.Worker, error) {
		return apiaddressupdater.NewAPIAddressUpdater(st.Machiner(), a.apiAddressSetter), nil
	})
	runner.StartWorker("logger", func() (worker.Worker, error) {
		return workerlogger.NewLogger(st.Logger(), agentConfig), nil
	})

	runner.StartWorker("rsyslog", func() (worker.Worker, error) {
		return cmdutil.NewRsyslogConfigWorker(st.Rsyslog(), agentConfig, rsyslogMode)
	})
	// TODO(axw) stop checking feature flag once storage has graduated.
	if featureflag.Enabled(storage.FeatureFlag) {
		runner.StartWorker("diskmanager", func() (worker.Worker, error) {
			api, err := st.DiskManager()
			if err != nil {
				return nil, errors.Trace(err)
			}
			return newDiskManager(diskmanager.DefaultListBlockDevices, api), nil
		})
	}

	// Check if the network management is disabled.
	envConfig, err := st.Environment().EnvironConfig()
	if err != nil {
		return nil, fmt.Errorf("cannot read environment config: %v", err)
	}
	disableNetworkManagement, _ := envConfig.DisableNetworkManagement()
	if disableNetworkManagement {
		logger.Infof("network management is disabled")
	}

	// Start networker depending on configuration and job.
	intrusiveMode := false
	for _, job := range entity.Jobs() {
		if job == multiwatcher.JobManageNetworking {
			intrusiveMode = true
			break
		}
	}
	intrusiveMode = intrusiveMode && !disableNetworkManagement
	runner.StartWorker("networker", func() (worker.Worker, error) {
		return newNetworker(st.Networker(), agentConfig, intrusiveMode, networker.DefaultConfigBaseDir)
	})

	// If not a local provider bootstrap machine, start the worker to
	// manage SSH keys.
	providerType := agentConfig.Value(agent.ProviderType)
	if providerType != provider.Local || a.machineId != bootstrapMachineId {
		runner.StartWorker("authenticationworker", func() (worker.Worker, error) {
			return authenticationworker.NewWorker(st.KeyUpdater(), agentConfig), nil
		})
	}

	// Perform the operations needed to set up hosting for containers.
	if err := a.setupContainerSupport(runner, st, entity, agentConfig); err != nil {
		cause := errors.Cause(err)
		if params.IsCodeDead(cause) || cause == worker.ErrTerminateAgent {
			return nil, worker.ErrTerminateAgent
		}
		return nil, fmt.Errorf("setting up container support: %v", err)
	}
	for _, job := range entity.Jobs() {
		switch job {
		case multiwatcher.JobHostUnits:
			runner.StartWorker("deployer", func() (worker.Worker, error) {
				apiDeployer := st.Deployer()
				context := newDeployContext(apiDeployer, agentConfig)
				return deployer.NewDeployer(apiDeployer, context), nil
			})
		case multiwatcher.JobManageEnviron:
			runner.StartWorker("identity-file-writer", func() (worker.Worker, error) {
				inner := func(<-chan struct{}) error {
					agentConfig := a.CurrentConfig()
					return agent.WriteSystemIdentityFile(agentConfig)
				}
				return worker.NewSimpleWorker(inner), nil
			})
		case multiwatcher.JobManageStateDeprecated:
			// Legacy environments may set this, but we ignore it.
		default:
			// TODO(dimitern): Once all workers moved over to using
			// the API, report "unknown job type" here.
		}
	}

	return cmdutil.NewCloseWorker(logger, runner, st), nil // Note: a worker.Runner is itself a worker.Worker.
}

func (a *MachineAgent) upgradeStepsWorkerStarter(
	st *api.State,
	jobs []multiwatcher.MachineJob,
) func() (worker.Worker, error) {
	return func() (worker.Worker, error) {
		return a.upgradeWorkerContext.Worker(a, st, jobs), nil
	}
}

// shouldWriteProxyFiles returns true, unless the supplied conf identifies the
// machine agent running directly on the host system in a local environment.
var shouldWriteProxyFiles = func(conf agent.Config) bool {
	if conf.Value(agent.ProviderType) != provider.Local {
		return true
	}
	return conf.Tag() != names.NewMachineTag(bootstrapMachineId)
}

// setupContainerSupport determines what containers can be run on this machine and
// initialises suitable infrastructure to support such containers.
func (a *MachineAgent) setupContainerSupport(runner worker.Runner, st *api.State, entity *apiagent.Entity, agentConfig agent.Config) error {
	var supportedContainers []instance.ContainerType
	// LXC containers are only supported on bare metal and fully virtualized linux systems.
	// Nested LXC containers and Windows machines cannot run LXC containers.
	supportsLXC, err := lxc.IsLXCSupported()
	if err != nil {
		logger.Warningf("no lxc containers possible: %v", err)
	}
	if err == nil && supportsLXC {
		supportedContainers = append(supportedContainers, instance.LXC)
	}

	supportsKvm, err := kvm.IsKVMSupported()
	if err != nil {
		logger.Warningf("determining kvm support: %v\nno kvm containers possible", err)
	}
	if err == nil && supportsKvm {
		supportedContainers = append(supportedContainers, instance.KVM)
	}
	return a.updateSupportedContainers(runner, st, entity.Tag(), supportedContainers, agentConfig)
}

// updateSupportedContainers records in state that a machine can run the specified containers.
// It starts a watcher and when a container of a given type is first added to the machine,
// the watcher is killed, the machine is set up to be able to start containers of the given type,
// and a suitable provisioner is started.
func (a *MachineAgent) updateSupportedContainers(
	runner worker.Runner,
	st *api.State,
	machineTag string,
	containers []instance.ContainerType,
	agentConfig agent.Config,
) error {
	pr := st.Provisioner()
	tag, err := names.ParseMachineTag(machineTag)
	if err != nil {
		return err
	}
	machine, err := pr.Machine(tag)
	if errors.IsNotFound(err) || err == nil && machine.Life() == params.Dead {
		return worker.ErrTerminateAgent
	}
	if err != nil {
		return errors.Annotatef(err, "cannot load machine %s from state", tag)
	}
	if len(containers) == 0 {
		if err := machine.SupportsNoContainers(); err != nil {
			return errors.Annotatef(err, "clearing supported containers for %s", tag)
		}
		return nil
	}
	if err := machine.SetSupportedContainers(containers...); err != nil {
		return errors.Annotatef(err, "setting supported containers for %s", tag)
	}
	initLock, err := cmdutil.HookExecutionLock(agentConfig.DataDir())
	if err != nil {
		return err
	}
	// Start the watcher to fire when a container is first requested on the machine.
	envUUID, err := st.EnvironTag()
	if err != nil {
		return err
	}
	watcherName := fmt.Sprintf("%s-container-watcher", machine.Id())
	// There may not be a CA certificate private key available, and without
	// it we can't ensure that other Juju nodes can connect securely, so only
	// use an image URL getter if there's a private key.
	var imageURLGetter container.ImageURLGetter
	if agentConfig.Value(agent.AllowsSecureConnection) == "true" {
		imageURLGetter = container.NewImageURLGetter(st.Addr(), envUUID.Id(), []byte(agentConfig.CACert()))
	}
	params := provisioner.ContainerSetupParams{
		Runner:              runner,
		WorkerName:          watcherName,
		SupportedContainers: containers,
		ImageURLGetter:      imageURLGetter,
		Machine:             machine,
		Provisioner:         pr,
		Config:              agentConfig,
		InitLock:            initLock,
	}
	handler := provisioner.NewContainerSetupHandler(params)
	a.startWorkerAfterUpgrade(runner, watcherName, func() (worker.Worker, error) {
		return worker.NewStringsWorker(handler), nil
	})
	return nil
}

// StateWorker returns a worker running all the workers that require
// a *state.State connection.
func (a *MachineAgent) StateWorker() (worker.Worker, error) {
	agentConfig := a.CurrentConfig()

	// Start MongoDB server and dial.
	if err := a.ensureMongoServer(agentConfig); err != nil {
		return nil, err
	}
	st, m, err := openState(agentConfig, stateWorkerDialOpts)
	if err != nil {
		return nil, err
	}
	reportOpenedState(st)

	stor := statestorage.NewStorage(st.EnvironUUID(), st.MongoSession())
	registerSimplestreamsDataSource(stor)

	runner := newConnRunner(st)
	singularRunner, err := newSingularStateRunner(runner, st, m)
	if err != nil {
		return nil, errors.Trace(err)
	}

	// Take advantage of special knowledge here in that we will only ever want
	// the storage provider on one machine, and that is the "bootstrap" node.
	providerType := agentConfig.Value(agent.ProviderType)
	if (providerType == provider.Local || provider.IsManual(providerType)) && m.Id() == bootstrapMachineId {
		a.startWorkerAfterUpgrade(runner, "local-storage", func() (worker.Worker, error) {
			// TODO(axw) 2013-09-24 bug #1229507
			// Make another job to enable storage.
			// There's nothing special about this.
			return localstorage.NewWorker(agentConfig), nil
		})
	}
	for _, job := range m.Jobs() {
		switch job {
		case state.JobHostUnits:
			// Implemented in APIWorker.
		case state.JobManageEnviron:
			useMultipleCPUs()
			a.startWorkerAfterUpgrade(runner, "env worker manager", func() (worker.Worker, error) {
				return envworkermanager.NewEnvWorkerManager(st, a.startEnvWorkers), nil
			})
			a.startWorkerAfterUpgrade(runner, "peergrouper", func() (worker.Worker, error) {
				return peergrouperNew(st)
			})
			a.startWorkerAfterUpgrade(runner, "restore", func() (worker.Worker, error) {
				return a.newRestoreStateWatcherWorker(st)
			})
			a.startWorkerAfterUpgrade(runner, "lease manager", func() (worker.Worker, error) {
				workerLoop := lease.WorkerLoop(st)
				return worker.NewSimpleWorker(workerLoop), nil
			})
			certChangedChan := make(chan params.StateServingInfo, 1)
			runner.StartWorker("apiserver", a.apiserverWorkerStarter(st, certChangedChan))
			var stateServingSetter certupdater.StateServingInfoSetter = func(info params.StateServingInfo) error {
				return a.ChangeConfig(func(config agent.ConfigSetter) error {
					config.SetStateServingInfo(info)
					logger.Infof("update apiserver worker with new certificate")
					certChangedChan <- info
					return nil
				})
			}
			a.startWorkerAfterUpgrade(runner, "certupdater", func() (worker.Worker, error) {
				return newCertificateUpdater(m, agentConfig, st, stateServingSetter, certChangedChan), nil
			})
			a.startWorkerAfterUpgrade(singularRunner, "resumer", func() (worker.Worker, error) {
				// The action of resumer is so subtle that it is not tested,
				// because we can't figure out how to do so without brutalising
				// the transaction log.
				return resumer.NewResumer(st), nil
			})
		case state.JobManageStateDeprecated:
			// Legacy environments may set this, but we ignore it.
		default:
			logger.Warningf("ignoring unknown job %q", job)
		}
	}
	return cmdutil.NewCloseWorker(logger, runner, st), nil
}
// startEnvWorkers starts state server workers that need to run per
// environment.
func (a *MachineAgent) startEnvWorkers(
	ssSt envworkermanager.InitialState,
	st *state.State,
) (runner worker.Runner, err error) {
	envUUID := st.EnvironUUID()
	defer errors.DeferredAnnotatef(&err, "failed to start workers for env %s", envUUID)
	logger.Infof("starting workers for env %s", envUUID)

	// Establish API connection for this environment.
	agentConfig := a.CurrentConfig()
	apiInfo := agentConfig.APIInfo()
	apiInfo.EnvironTag = st.EnvironTag()
	apiSt, err := OpenAPIStateUsingInfo(apiInfo, a, agentConfig.OldPassword())
	if err != nil {
		return nil, errors.Trace(err)
	}

	// Create a runner for workers specific to this
	// environment. Either the State or API connection failing will be
	// considered fatal, killing the runner and all its workers.
	runner = newConnRunner(st, apiSt)
	defer func() {
		if err != nil && runner != nil {
			runner.Kill()
			runner.Wait()
		}
	}()
	// Close the API connection when the runner for this environment dies.
	go func() {
		runner.Wait()
		err := apiSt.Close()
		if err != nil {
			logger.Errorf("failed to close API connection for env %s: %v", envUUID, err)
		}
	}()

	// Create a singular runner for this environment.
	machine, err := ssSt.Machine(a.machineId)
	if err != nil {
		return nil, errors.Trace(err)
	}
	singularRunner, err := newSingularStateRunner(runner, ssSt, machine)
	if err != nil {
		return nil, errors.Trace(err)
	}
	defer func() {
		if err != nil && singularRunner != nil {
			singularRunner.Kill()
			singularRunner.Wait()
		}
	}()

	// Start workers that depend on a *state.State.
	runner.StartWorker("instancepoller", func() (worker.Worker, error) {
		return instancepoller.NewWorker(st), nil
	})
	singularRunner.StartWorker("cleaner", func() (worker.Worker, error) {
		return cleaner.NewCleaner(st), nil
	})
	singularRunner.StartWorker("minunitsworker", func() (worker.Worker, error) {
		return minunitsworker.NewMinUnitsWorker(st), nil
	})

	// Start workers that use an API connection.
	singularRunner.StartWorker("environ-provisioner", func() (worker.Worker, error) {
		return provisioner.NewEnvironProvisioner(apiSt.Provisioner(), agentConfig), nil
	})
	singularRunner.StartWorker("charm-revision-updater", func() (worker.Worker, error) {
		return charmrevisionworker.NewRevisionUpdateWorker(apiSt.CharmRevisionUpdater()), nil
	})
	runner.StartWorker("metricmanagerworker", func() (worker.Worker, error) {
		return metricworker.NewMetricsManager(getMetricAPI(apiSt))
	})

	// TODO(axw) 2013-09-24 bug #1229506
	// Make another job to enable the firewaller. Not all
	// environments are capable of managing ports
	// centrally.
	fwMode, err := getFirewallMode(apiSt)
	if err != nil {
		return nil, errors.Annotate(err, "cannot get firewall mode")
	}
	if fwMode != config.FwNone {
		singularRunner.StartWorker("firewaller", func() (worker.Worker, error) {
			return newFirewaller(apiSt.Firewaller())
		})
	} else {
		logger.Debugf("not starting firewaller worker - firewall-mode is %q", fwMode)
	}

	return runner, nil
}

var getFirewallMode = _getFirewallMode

func _getFirewallMode(apiSt *api.State) (string, error) {
	envConfig, err := apiSt.Environment().EnvironConfig()
	if err != nil {
		return "", errors.Annotate(err, "cannot read environment config")
	}
	return envConfig.FirewallMode(), nil
}
// stateWorkerDialOpts is a mongo.DialOpts suitable
// for use by StateWorker to dial mongo.
//
// This must be overridden in tests, as it assumes
// journaling is enabled.
var stateWorkerDialOpts mongo.DialOpts

func (a *MachineAgent) apiserverWorkerStarter(st *state.State, certChanged chan params.StateServingInfo) func() (worker.Worker, error) {
	return func() (worker.Worker, error) { return a.newApiserverWorker(st, certChanged) }
}

func (a *MachineAgent) newApiserverWorker(st *state.State, certChanged chan params.StateServingInfo) (worker.Worker, error) {
	agentConfig := a.CurrentConfig()
	// If the configuration does not have the required information,
	// it is currently not a recoverable error, so we kill the whole
	// agent, potentially enabling human intervention to fix
	// the agent's configuration file.
	info, ok := agentConfig.StateServingInfo()
	if !ok {
		return nil, &cmdutil.FatalError{"StateServingInfo not available and we need it"}
	}
	cert := []byte(info.Cert)
	key := []byte(info.PrivateKey)

	if len(cert) == 0 || len(key) == 0 {
		return nil, &cmdutil.FatalError{"configuration does not have state server cert/key"}
	}
	tag := agentConfig.Tag()
	dataDir := agentConfig.DataDir()
	logDir := agentConfig.LogDir()

	// An empty host in the endpoint means "listen on all interfaces"
	// at the API port.
	endpoint := net.JoinHostPort("", strconv.Itoa(info.APIPort))
	listener, err := net.Listen("tcp", endpoint)
	if err != nil {
		return nil, err
	}
	return apiserver.NewServer(st, listener, apiserver.ServerConfig{
		Cert:        cert,
		Key:         key,
		Tag:         tag,
		DataDir:     dataDir,
		LogDir:      logDir,
		Validator:   a.limitLogins,
		CertChanged: certChanged,
	})
}

// limitLogins is called by the API server for each login attempt.
// It returns an error if an upgrade or restore is running.
func (a *MachineAgent) limitLogins(req params.LoginRequest) error {
	if err := a.limitLoginsDuringRestore(req); err != nil {
		return err
	}
	return a.limitLoginsDuringUpgrade(req)
}

// limitLoginsDuringRestore will only allow logins for restore-related purposes
// while the different steps of restore are running.
func (a *MachineAgent) limitLoginsDuringRestore(req params.LoginRequest) error {
	var err error
	switch {
	case a.IsRestoreRunning():
		err = apiserver.RestoreInProgressError
	case a.IsRestorePreparing():
		err = apiserver.AboutToRestoreError
	}
	if err != nil {
		authTag, parseErr := names.ParseTag(req.AuthTag)
		if parseErr != nil {
			return errors.Annotate(err, "could not parse auth tag")
		}
		switch authTag := authTag.(type) {
		case names.UserTag:
			// use a restricted API mode
			return err
		case names.MachineTag:
			if authTag == a.Tag() {
				// allow logins from the local machine
				return nil
			}
		}
		return errors.Errorf("login for %q blocked because restore is in progress", authTag)
	}
	return nil
}

// limitLoginsDuringUpgrade is called by the API server for each login
// attempt. It returns an error if upgrades are in progress unless the
// login is for a user (i.e. a client) or the local machine.
func (a *MachineAgent) limitLoginsDuringUpgrade(req params.LoginRequest) error {
	if a.upgradeWorkerContext.IsUpgradeRunning() {
		authTag, err := names.ParseTag(req.AuthTag)
		if err != nil {
			return errors.Annotate(err, "could not parse auth tag")
		}
		switch authTag := authTag.(type) {
		case names.UserTag:
			// use a restricted API mode
			return apiserver.UpgradeInProgressError
		case names.MachineTag:
			if authTag == a.Tag() {
				// allow logins from the local machine
				return nil
			}
		}
		return errors.Errorf("login for %q blocked because upgrade is in progress", authTag)
	} else {
		return nil // allow all logins
	}
}

// ensureMongoServer ensures that mongo is installed and running,
// and ready for opening a state connection.
func (a *MachineAgent) ensureMongoServer(agentConfig agent.Config) (err error) {
	a.mongoInitMutex.Lock()
	defer a.mongoInitMutex.Unlock()
	if a.mongoInitialized {
		logger.Debugf("mongo is already initialized")
		return nil
	}
	defer func() {
		if err == nil {
			a.mongoInitialized = true
		}
	}()

	servingInfo, ok := agentConfig.StateServingInfo()
	if !ok {
		return fmt.Errorf("state worker was started with no state serving info")
	}

	// When upgrading from a pre-HA-capable environment,
	// we must add machine-0 to the admin database and
	// initiate its replicaset.
	//
	// TODO(axw) remove this when we no longer need
	// to upgrade from pre-HA-capable environments.
	var shouldInitiateMongoServer bool
	var addrs []network.Address
	if isPreHAVersion(a.previousAgentVersion) {
		_, err := a.ensureMongoAdminUser(agentConfig)
		if err != nil {
			return err
		}
		if servingInfo.SharedSecret == "" {
			servingInfo.SharedSecret, err = mongo.GenerateSharedSecret()
			if err != nil {
				return err
			}
			if err = a.ChangeConfig(func(config agent.ConfigSetter) error {
				config.SetStateServingInfo(servingInfo)
				return nil
			}); err != nil {
				return err
			}
			agentConfig = a.CurrentConfig()
		}
		// Note: we set Direct=true in the mongo options because it's
		// possible that we've previously upgraded the mongo server's
		// configuration to form a replicaset, but failed to initiate it.
		st, m, err := openState(agentConfig, mongo.DialOpts{Direct: true})
		if err != nil {
			return err
		}
		ssi := cmdutil.ParamsStateServingInfoToStateStateServingInfo(servingInfo)
		if err := st.SetStateServingInfo(ssi); err != nil {
			st.Close()
			return fmt.Errorf("cannot set state serving info: %v", err)
		}
		st.Close()
		addrs = m.Addresses()
		shouldInitiateMongoServer = true
	}

	// ensureMongoServer installs/upgrades the init config as necessary.
	ensureServerParams, err := cmdutil.NewEnsureServerParams(agentConfig)
	if err != nil {
		return err
	}
	if err := cmdutil.EnsureMongoServer(ensureServerParams); err != nil {
		return err
	}
	if !shouldInitiateMongoServer {
		return nil
	}

	// Initiate the replicaset for upgraded environments.
	//
	// TODO(axw) remove this when we no longer need
	// to upgrade from pre-HA-capable environments.
	stateInfo, ok := agentConfig.MongoInfo()
	if !ok {
		return fmt.Errorf("state worker was started with no state serving info")
	}
	dialInfo, err := mongo.DialInfo(stateInfo.Info, mongo.DefaultDialOpts())
	if err != nil {
		return err
	}
	peerAddr := mongo.SelectPeerAddress(addrs)
	if peerAddr == "" {
		return fmt.Errorf("no appropriate peer address found in %q", addrs)
	}
	if err := maybeInitiateMongoServer(peergrouper.InitiateMongoParams{
		DialInfo:       dialInfo,
		MemberHostPort: net.JoinHostPort(peerAddr, fmt.Sprint(servingInfo.StatePort)),
		// TODO(dfc) InitiateMongoParams should take a Tag
		User:     stateInfo.Tag.String(),
		Password: stateInfo.Password,
	}); err != nil && err != peergrouper.ErrReplicaSetAlreadyInitiated {
		return err
	}
	return nil
}

func (a *MachineAgent) ensureMongoAdminUser(agentConfig agent.Config) (added bool, err error) {
	stateInfo, ok1 := agentConfig.MongoInfo()
	servingInfo, ok2 := agentConfig.StateServingInfo()
	if !ok1 || !ok2 {
		return false, fmt.Errorf("no state serving info configuration")
	}
	dialInfo, err := mongo.DialInfo(stateInfo.Info, mongo.DefaultDialOpts())
	if err != nil {
		return false, err
	}
	if len(dialInfo.Addrs) > 1 {
		logger.Infof("more than one state server; admin user must exist")
		return false, nil
	}
	return ensureMongoAdminUser(mongo.EnsureAdminUserParams{
		DialInfo:  dialInfo,
		Namespace: agentConfig.Value(agent.Namespace),
		DataDir:   agentConfig.DataDir(),
		Port:      servingInfo.StatePort,
		User:      stateInfo.Tag.String(),
		Password:  stateInfo.Password,
	})
}

func isPreHAVersion(v version.Number) bool {
	return v.Compare(version.MustParse("1.19.0")) < 0
}
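
// By the definition above, anything older than 1.19.0 predates HA support:
//
//	isPreHAVersion(version.MustParse("1.18.4")) // true
//	isPreHAVersion(version.MustParse("1.19.0")) // false
//	isPreHAVersion(version.MustParse("1.21.3")) // false
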
func openState(agentConfig agent.Config, dialOpts mongo.DialOpts) (_ *state.State, _ *state.Machine, err error) {
	info, ok := agentConfig.MongoInfo()
	if !ok {
		return nil, nil, fmt.Errorf("no state info available")
	}
	st, err := state.Open(info, dialOpts, environs.NewStatePolicy())
	if err != nil {
		return nil, nil, err
	}
	defer func() {
		if err != nil {
			st.Close()
		}
	}()
	m0, err := st.FindEntity(agentConfig.Tag())
	if err != nil {
		if errors.IsNotFound(err) {
			err = worker.ErrTerminateAgent
		}
		return nil, nil, err
	}
	m := m0.(*state.Machine)
	if m.Life() == state.Dead {
		return nil, nil, worker.ErrTerminateAgent
	}
	// Check the machine nonce as provisioned matches the agent.Conf value.
	if !m.CheckProvisioned(agentConfig.Nonce()) {
		// The agent is running on a different machine to the one it
		// should be according to state. It must stop immediately.
		logger.Errorf("running machine %v agent on inappropriate instance", m)
		return nil, nil, worker.ErrTerminateAgent
	}
	return st, m, nil
}

// startWorkerAfterUpgrade starts a worker to run the specified child worker
// but only after waiting for upgrades to complete.
func (a *MachineAgent) startWorkerAfterUpgrade(runner worker.Runner, name string, start func() (worker.Worker, error)) {
	runner.StartWorker(name, func() (worker.Worker, error) {
		return a.upgradeWaiterWorker(start), nil
	})
}

// upgradeWaiterWorker runs the specified worker after upgrades have completed.
func (a *MachineAgent) upgradeWaiterWorker(start func() (worker.Worker, error)) worker.Worker {
	return worker.NewSimpleWorker(func(stop <-chan struct{}) error {
		// Wait for the upgrade to complete (or for us to be stopped).
		select {
		case <-stop:
			return nil
		case <-a.upgradeWorkerContext.UpgradeComplete:
		}
		// Upgrades are done, start the worker.
		worker, err := start()
		if err != nil {
			return err
		}
		// Wait for worker to finish or for us to be stopped.
		waitCh := make(chan error)
		go func() {
			waitCh <- worker.Wait()
		}()
		select {
		case err := <-waitCh:
			return err
		case <-stop:
			worker.Kill()
		}
		return <-waitCh // Ensure worker has stopped before returning.
	})
}
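
// An illustrative timeline for the gating above: given
//
//	a.startWorkerAfterUpgrade(runner, "cleaner", newCleanerWorker)
//
// (a hypothetical start function), the runner starts the wrapper worker
// immediately, but newCleanerWorker is only invoked once
// a.upgradeWorkerContext.UpgradeComplete is closed; if the wrapper is
// stopped first, newCleanerWorker is never called at all.
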
// setMachineStatus reports the given status for this agent's machine
// via the API.
func (a *MachineAgent) setMachineStatus(apiState *api.State, status params.Status, info string) error {
	tag := a.Tag().(names.MachineTag)
	machine, err := apiState.Machiner().Machine(tag)
	if err != nil {
		return errors.Trace(err)
	}
	if err := machine.SetStatus(status, info, nil); err != nil {
		return errors.Trace(err)
	}
	return nil
}

// WorkersStarted returns a channel that's closed once all top level workers
// have been started. This is provided for testing purposes.
func (a *MachineAgent) WorkersStarted() <-chan struct{} {
	return a.workersStarted
}

// Tag returns the machine tag of the agent.
func (a *MachineAgent) Tag() names.Tag {
	return names.NewMachineTag(a.machineId)
}

func (a *MachineAgent) createJujuRun(dataDir string) error {
	// TODO do not remove the symlink if it already points
	// to the right place.
	if err := os.Remove(JujuRun); err != nil && !os.IsNotExist(err) {
		return err
	}
	jujud := filepath.Join(dataDir, "tools", a.Tag().String(), jujunames.Jujud)
	return symlink.New(jujud, JujuRun)
}

func (a *MachineAgent) uninstallAgent(agentConfig agent.Config) error {
	var errors []error
	agentServiceName := agentConfig.Value(agent.AgentServiceName)
	if agentServiceName == "" {
		// For backwards compatibility, handle lack of AgentServiceName.
		agentServiceName = os.Getenv("UPSTART_JOB")
	}
	if agentServiceName != "" {
		if err := service.NewService(agentServiceName, common.Conf{}).Remove(); err != nil {
			errors = append(errors, fmt.Errorf("cannot remove service %q: %v", agentServiceName, err))
		}
	}
	// Remove the juju-run symlink.
	if err := os.Remove(JujuRun); err != nil && !os.IsNotExist(err) {
		errors = append(errors, err)
	}

	namespace := agentConfig.Value(agent.Namespace)
	if err := mongo.RemoveService(namespace); err != nil {
		errors = append(errors, fmt.Errorf("cannot stop/remove mongo service with namespace %q: %v", namespace, err))
	}
	if err := os.RemoveAll(agentConfig.DataDir()); err != nil {
		errors = append(errors, err)
	}
	if len(errors) == 0 {
		return nil
	}
	return fmt.Errorf("uninstall failed: %v", errors)
}

// newConnRunner returns a runner that treats the failure of any of the
// given connections as fatal.
func newConnRunner(conns ...cmdutil.Pinger) worker.Runner {
	return worker.NewRunner(cmdutil.ConnectionIsFatal(logger, conns...), cmdutil.MoreImportant)
}

type MongoSessioner interface {
	MongoSession() *mgo.Session
}

func newSingularStateRunner(runner worker.Runner, st MongoSessioner, m *state.Machine) (worker.Runner, error) {
	singularStateConn := singularStateConn{st.MongoSession(), m}
	singularRunner, err := newSingularRunner(runner, singularStateConn)
	if err != nil {
		return nil, errors.Annotate(err, "cannot make singular State Runner")
	}
	return singularRunner, err
}

// singularStateConn implements singular.Conn on
// top of a State connection.
type singularStateConn struct {
	session *mgo.Session
	machine *state.Machine
}

func (c singularStateConn) IsMaster() (bool, error) {
	return mongo.IsMaster(c.session, c.machine)
}

func (c singularStateConn) Ping() error {
	return c.session.Ping()
}
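
// A usage sketch of the singular pattern (mirroring StateWorker and
// startEnvWorkers above): workers started on the singular runner only
// actually run while this machine is the replica-set master, so at most one
// instance runs across the environment's state servers:
//
//	singularRunner, err := newSingularStateRunner(runner, st, m)
//	if err != nil {
//		return nil, errors.Trace(err)
//	}
//	singularRunner.StartWorker("cleaner", func() (worker.Worker, error) {
//		return cleaner.NewCleaner(st), nil
//	})
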
// metricAPI returns a metrics manager client for the given API state.
func metricAPI(st *api.State) metricsmanager.MetricsManagerClient {
	return metricsmanager.NewClient(st)
}

// newDeployContext gives the tests the opportunity to create a deployer.Context
// that can be used for testing, so as to avoid (1) deploying units to the system
// running the tests and (2) to get access to the *State used internally, so that
// tests can be run without waiting for the 5s watcher refresh time to which we
// would otherwise be restricted.
var newDeployContext = func(st *apideployer.State, agentConfig agent.Config) deployer.Context {
	return deployer.NewSimpleContext(agentConfig, st)
}