github.com/rogpeppe/juju@v0.0.0-20140613142852-6337964b789e/cmd/jujud/machine.go

// Copyright 2012, 2013 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package main

import (
	"fmt"
	"net"
	"os"
	"path/filepath"
	"runtime"
	"time"

	"github.com/juju/charm"
	"github.com/juju/cmd"
	"github.com/juju/errors"
	"github.com/juju/loggo"
	"github.com/juju/names"
	"github.com/juju/utils"
	"github.com/juju/utils/voyeur"
	"labix.org/v2/mgo"
	"launchpad.net/gnuflag"
	"launchpad.net/tomb"

	"github.com/juju/juju/agent"
	"github.com/juju/juju/container/kvm"
	"github.com/juju/juju/environs"
	"github.com/juju/juju/instance"
	"github.com/juju/juju/mongo"
	"github.com/juju/juju/network"
	"github.com/juju/juju/provider"
	"github.com/juju/juju/state"
	"github.com/juju/juju/state/api"
	apiagent "github.com/juju/juju/state/api/agent"
	"github.com/juju/juju/state/api/params"
	"github.com/juju/juju/state/apiserver"
	"github.com/juju/juju/upgrades"
	"github.com/juju/juju/upstart"
	"github.com/juju/juju/version"
	"github.com/juju/juju/worker"
	"github.com/juju/juju/worker/apiaddressupdater"
	"github.com/juju/juju/worker/authenticationworker"
	"github.com/juju/juju/worker/charmrevisionworker"
	"github.com/juju/juju/worker/cleaner"
	"github.com/juju/juju/worker/deployer"
	"github.com/juju/juju/worker/firewaller"
	"github.com/juju/juju/worker/instancepoller"
	"github.com/juju/juju/worker/localstorage"
	workerlogger "github.com/juju/juju/worker/logger"
	"github.com/juju/juju/worker/machineenvironmentworker"
	"github.com/juju/juju/worker/machiner"
	"github.com/juju/juju/worker/minunitsworker"
	"github.com/juju/juju/worker/peergrouper"
	"github.com/juju/juju/worker/provisioner"
	"github.com/juju/juju/worker/resumer"
	"github.com/juju/juju/worker/rsyslog"
	"github.com/juju/juju/worker/singular"
	"github.com/juju/juju/worker/terminationworker"
	"github.com/juju/juju/worker/upgrader"
)

var logger = loggo.GetLogger("juju.cmd.jujud")

var newRunner = worker.NewRunner

const bootstrapMachineId = "0"

// eitherState can be either a *state.State or an *api.State.
type eitherState interface{}

var (
	retryDelay      = 3 * time.Second
	jujuRun         = "/usr/local/bin/juju-run"
	useMultipleCPUs = utils.UseMultipleCPUs

	// The following are defined as variables to
	// allow the tests to intercept calls to the functions.
	ensureMongoServer        = mongo.EnsureServer
	maybeInitiateMongoServer = peergrouper.MaybeInitiateMongoServer
	ensureMongoAdminUser     = mongo.EnsureAdminUser
	newSingularRunner        = singular.New
	peergrouperNew           = peergrouper.New

	// reportOpenedState is exposed for tests to know when
	// the State has been successfully opened.
	reportOpenedState = func(eitherState) {}

	// reportOpenedAPI is exposed for tests to know when
	// the API has been successfully opened.
	reportOpenedAPI = func(eitherState) {}
)

// MachineAgent is a cmd.Command responsible for running a machine agent.
type MachineAgent struct {
	cmd.CommandBase
	tomb tomb.Tomb
	AgentConf
	MachineId        string
	runner           worker.Runner
	configChangedVal voyeur.Value
	upgradeComplete  chan struct{}
	workersStarted   chan struct{}
	st               *state.State
}

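// Illustrative note (an assumption, not taken from this file): the machine
// agent is registered as the "machine" subcommand of the jujud binary, so a
// typical invocation would look roughly like
//
//	jujud machine --machine-id 0
//
// --machine-id is registered in SetFlags below; any other flags (such as the
// data directory) are presumed to be added by the embedded AgentConf.
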
// Info returns usage information for the command.
func (a *MachineAgent) Info() *cmd.Info {
	return &cmd.Info{
		Name:    "machine",
		Purpose: "run a juju machine agent",
	}
}

func (a *MachineAgent) SetFlags(f *gnuflag.FlagSet) {
	a.AgentConf.AddFlags(f)
	f.StringVar(&a.MachineId, "machine-id", "", "id of the machine to run")
}

// Init initializes the command for running.
func (a *MachineAgent) Init(args []string) error {
	if !names.IsMachine(a.MachineId) {
		return fmt.Errorf("--machine-id option must be set, and expects a non-negative integer")
	}
	if err := a.AgentConf.CheckArgs(args); err != nil {
		return err
	}
	a.runner = newRunner(isFatal, moreImportant)
	a.upgradeComplete = make(chan struct{})
	a.workersStarted = make(chan struct{})
	return nil
}

// Wait waits for the machine agent to finish.
func (a *MachineAgent) Wait() error {
	return a.tomb.Wait()
}

// Stop stops the machine agent.
func (a *MachineAgent) Stop() error {
	a.runner.Kill()
	return a.tomb.Wait()
}

// Run runs a machine agent.
func (a *MachineAgent) Run(_ *cmd.Context) error {
	// The logging setup has changed across versions, and environments
	// upgraded from older versions may still have registered a file writer;
	// remove it explicitly here, otherwise every log line would appear
	// twice in the log file.
	loggo.RemoveWriter("logfile")
	defer a.tomb.Done()
	logger.Infof("machine agent %v start (%s [%s])", a.Tag(), version.Current, runtime.Compiler)
	if err := a.ReadConfig(a.Tag()); err != nil {
		return fmt.Errorf("cannot read agent configuration: %v", err)
	}
	a.configChangedVal.Set(struct{}{})
	agentConfig := a.CurrentConfig()
	charm.CacheDir = filepath.Join(agentConfig.DataDir(), "charmcache")
	if err := a.createJujuRun(agentConfig.DataDir()); err != nil {
		return fmt.Errorf("cannot create juju run symlink: %v", err)
	}
	a.runner.StartWorker("api", a.APIWorker)
	a.runner.StartWorker("statestarter", a.newStateStarterWorker)
	a.runner.StartWorker("termination", func() (worker.Worker, error) {
		return terminationworker.NewWorker(), nil
	})
	// At this point, all workers will have been configured to start.
	close(a.workersStarted)
	err := a.runner.Wait()
	if err == worker.ErrTerminateAgent {
		err = a.uninstallAgent(agentConfig)
	}
	err = agentDone(err)
	a.tomb.Kill(err)
	return err
}

func (a *MachineAgent) ChangeConfig(mutate func(config agent.ConfigSetter)) error {
	err := a.AgentConf.ChangeConfig(mutate)
	a.configChangedVal.Set(struct{}{})
	return err
}

// newStateStarterWorker wraps stateStarter in a simple worker for use in
// a.runner.StartWorker.
func (a *MachineAgent) newStateStarterWorker() (worker.Worker, error) {
	return worker.NewSimpleWorker(a.stateStarter), nil
}

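// A note on the restart mechanism: a.configChangedVal is a voyeur.Value that
// is Set whenever the configuration is read in Run or mutated through
// ChangeConfig. stateStarter below watches that value and, on every change,
// re-checks whether the configuration now carries StateServingInfo, starting
// or stopping the "state" worker accordingly.
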
// stateStarter watches for changes to the agent configuration, and
// starts or stops the state worker as appropriate. We watch the agent
// configuration because the agent configuration has all the details
// that we need to start a state server, whether they have been cached
// or read from the state.
//
// It will stop working as soon as stopch is closed.
func (a *MachineAgent) stateStarter(stopch <-chan struct{}) error {
	confWatch := a.configChangedVal.Watch()
	defer confWatch.Close()
	watchCh := make(chan struct{})
	go func() {
		for confWatch.Next() {
			watchCh <- struct{}{}
		}
	}()
	for {
		select {
		case <-watchCh:
			agentConfig := a.CurrentConfig()

			// N.B. StartWorker and StopWorker are idempotent.
			_, ok := agentConfig.StateServingInfo()
			if ok {
				a.runner.StartWorker("state", func() (worker.Worker, error) {
					return a.StateWorker()
				})
			} else {
				a.runner.StopWorker("state")
			}
		case <-stopch:
			return nil
		}
	}
}

// APIWorker returns a Worker that connects to the API and starts any
// workers that need an API connection.
func (a *MachineAgent) APIWorker() (worker.Worker, error) {
	agentConfig := a.CurrentConfig()
	st, entity, err := openAPIState(agentConfig, a)
	if err != nil {
		return nil, err
	}
	reportOpenedAPI(st)

	// Refresh the configuration, since it may have been updated after opening state.
	agentConfig = a.CurrentConfig()

	for _, job := range entity.Jobs() {
		if job.NeedsState() {
			info, err := st.Agent().StateServingInfo()
			if err != nil {
				return nil, fmt.Errorf("cannot get state serving info: %v", err)
			}
			err = a.ChangeConfig(func(config agent.ConfigSetter) {
				config.SetStateServingInfo(info)
			})
			if err != nil {
				return nil, err
			}
			agentConfig = a.CurrentConfig()
			break
		}
	}

	rsyslogMode := rsyslog.RsyslogModeForwarding
	runner := newRunner(connectionIsFatal(st), moreImportant)
	var singularRunner worker.Runner
	for _, job := range entity.Jobs() {
		if job == params.JobManageEnviron {
			rsyslogMode = rsyslog.RsyslogModeAccumulate
			conn := singularAPIConn{st, st.Agent()}
			singularRunner, err = newSingularRunner(runner, conn)
			if err != nil {
				return nil, fmt.Errorf("cannot make singular API Runner: %v", err)
			}
			break
		}
	}

	// Run the upgrader and the upgrade-steps worker without waiting for
	// the upgrade steps to complete.
	runner.StartWorker("upgrader", func() (worker.Worker, error) {
		return upgrader.NewUpgrader(st.Upgrader(), agentConfig), nil
	})
	runner.StartWorker("upgrade-steps", func() (worker.Worker, error) {
		return a.upgradeWorker(st, entity.Jobs(), agentConfig), nil
	})

	// All other workers must wait for the upgrade steps to complete
	// before starting.
	a.startWorkerAfterUpgrade(runner, "machiner", func() (worker.Worker, error) {
		return machiner.NewMachiner(st.Machiner(), agentConfig), nil
	})
	a.startWorkerAfterUpgrade(runner, "apiaddressupdater", func() (worker.Worker, error) {
		return apiaddressupdater.NewAPIAddressUpdater(st.Machiner(), a), nil
	})
	a.startWorkerAfterUpgrade(runner, "logger", func() (worker.Worker, error) {
		return workerlogger.NewLogger(st.Logger(), agentConfig), nil
	})
	a.startWorkerAfterUpgrade(runner, "machineenvironmentworker", func() (worker.Worker, error) {
		return machineenvironmentworker.NewMachineEnvironmentWorker(st.Environment(), agentConfig), nil
	})
	a.startWorkerAfterUpgrade(runner, "rsyslog", func() (worker.Worker, error) {
		return newRsyslogConfigWorker(st.Rsyslog(), agentConfig, rsyslogMode)
	})

	// If not a local provider bootstrap machine, start the worker to
	// manage SSH keys.
	providerType := agentConfig.Value(agent.ProviderType)
	if providerType != provider.Local || a.MachineId != bootstrapMachineId {
		a.startWorkerAfterUpgrade(runner, "authenticationworker", func() (worker.Worker, error) {
			return authenticationworker.NewWorker(st.KeyUpdater(), agentConfig), nil
		})
	}

	// Perform the operations needed to set up hosting for containers.
	if err := a.setupContainerSupport(runner, st, entity, agentConfig); err != nil {
		return nil, fmt.Errorf("setting up container support: %v", err)
	}
	for _, job := range entity.Jobs() {
		switch job {
		case params.JobHostUnits:
			a.startWorkerAfterUpgrade(runner, "deployer", func() (worker.Worker, error) {
				apiDeployer := st.Deployer()
				context := newDeployContext(apiDeployer, agentConfig)
				return deployer.NewDeployer(apiDeployer, context), nil
			})
		case params.JobManageEnviron:
			a.startWorkerAfterUpgrade(singularRunner, "environ-provisioner", func() (worker.Worker, error) {
				return provisioner.NewEnvironProvisioner(st.Provisioner(), agentConfig), nil
			})
			// TODO(axw) 2013-09-24 bug #1229506
			// Make another job to enable the firewaller. Not all
			// environments are capable of managing ports
			// centrally.
			a.startWorkerAfterUpgrade(singularRunner, "firewaller", func() (worker.Worker, error) {
				return firewaller.NewFirewaller(st.Firewaller())
			})
			a.startWorkerAfterUpgrade(singularRunner, "charm-revision-updater", func() (worker.Worker, error) {
				return charmrevisionworker.NewRevisionUpdateWorker(st.CharmRevisionUpdater()), nil
			})
		case params.JobManageStateDeprecated:
			// Legacy environments may set this, but we ignore it.
		default:
			// TODO(dimitern): Once all workers have moved over to using
			// the API, report "unknown job type" here.
		}
	}
	return newCloseWorker(runner, st), nil // Note: a worker.Runner is itself a worker.Worker.
}

// setupContainerSupport determines what containers can be run on this machine and
// initialises suitable infrastructure to support such containers.
func (a *MachineAgent) setupContainerSupport(runner worker.Runner, st *api.State, entity *apiagent.Entity, agentConfig agent.Config) error {
	var supportedContainers []instance.ContainerType
	// We don't yet support nested lxc containers but anything else can run an LXC container.
	if entity.ContainerType() != instance.LXC {
		supportedContainers = append(supportedContainers, instance.LXC)
	}
	supportsKvm, err := kvm.IsKVMSupported()
	if err != nil {
		logger.Warningf("determining kvm support: %v\nno kvm containers possible", err)
	}
	if err == nil && supportsKvm {
		supportedContainers = append(supportedContainers, instance.KVM)
	}
	return a.updateSupportedContainers(runner, st, entity.Tag(), supportedContainers, agentConfig)
}

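// Reading setupContainerSupport above concretely: on a machine that is not
// itself an LXC container and where kvm.IsKVMSupported() reports true,
// supportedContainers ends up as [LXC, KVM]; on a machine that is itself an
// LXC container, nested LXC is excluded and only KVM (if supported there)
// is recorded.
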
// updateSupportedContainers records in state that a machine can run the specified containers.
// It starts a watcher and when a container of a given type is first added to the machine,
// the watcher is killed, the machine is set up to be able to start containers of the given type,
// and a suitable provisioner is started.
func (a *MachineAgent) updateSupportedContainers(
	runner worker.Runner,
	st *api.State,
	tag string,
	containers []instance.ContainerType,
	agentConfig agent.Config,
) error {
	pr := st.Provisioner()
	machine, err := pr.Machine(tag)
	if err != nil {
		return fmt.Errorf("%s is not in state: %v", tag, err)
	}
	if len(containers) == 0 {
		if err := machine.SupportsNoContainers(); err != nil {
			return fmt.Errorf("clearing supported containers for %s: %v", tag, err)
		}
		return nil
	}
	if err := machine.SetSupportedContainers(containers...); err != nil {
		return fmt.Errorf("setting supported containers for %s: %v", tag, err)
	}
	initLock, err := hookExecutionLock(agentConfig.DataDir())
	if err != nil {
		return err
	}
	// Start the watcher to fire when a container is first requested on the machine.
	watcherName := fmt.Sprintf("%s-container-watcher", machine.Id())
	handler := provisioner.NewContainerSetupHandler(
		runner,
		watcherName,
		containers,
		machine,
		pr,
		agentConfig,
		initLock,
	)
	a.startWorkerAfterUpgrade(runner, watcherName, func() (worker.Worker, error) {
		return worker.NewStringsWorker(handler), nil
	})
	return nil
}

// StateWorker returns a worker running all the workers that require
// a *state.State connection.
func (a *MachineAgent) StateWorker() (worker.Worker, error) {
	agentConfig := a.CurrentConfig()

	// Create the system-identity file.
	if err := agent.WriteSystemIdentityFile(agentConfig); err != nil {
		return nil, err
	}

	// Start the MongoDB server.
	if err := a.ensureMongoServer(agentConfig); err != nil {
		return nil, err
	}
	st, m, err := openState(agentConfig)
	if err != nil {
		return nil, err
	}
	reportOpenedState(st)

	singularStateConn := singularStateConn{st.MongoSession(), m}
	runner := newRunner(connectionIsFatal(st), moreImportant)
	singularRunner, err := newSingularRunner(runner, singularStateConn)
	if err != nil {
		return nil, fmt.Errorf("cannot make singular State Runner: %v", err)
	}

	// Take advantage of special knowledge here in that we will only ever want
	// the storage provider on one machine, and that is the "bootstrap" node.
	providerType := agentConfig.Value(agent.ProviderType)
	if (providerType == provider.Local || provider.IsManual(providerType)) && m.Id() == bootstrapMachineId {
		a.startWorkerAfterUpgrade(runner, "local-storage", func() (worker.Worker, error) {
			// TODO(axw) 2013-09-24 bug #1229507
			// Make another job to enable storage.
			// There's nothing special about this.
			return localstorage.NewWorker(agentConfig), nil
		})
	}
	for _, job := range m.Jobs() {
		switch job {
		case state.JobHostUnits:
			// Implemented in APIWorker.
		case state.JobManageEnviron:
			useMultipleCPUs()
			a.startWorkerAfterUpgrade(runner, "instancepoller", func() (worker.Worker, error) {
				return instancepoller.NewWorker(st), nil
			})
			a.startWorkerAfterUpgrade(runner, "peergrouper", func() (worker.Worker, error) {
				return peergrouperNew(st)
			})
			runner.StartWorker("apiserver", func() (worker.Worker, error) {
				// If the configuration does not have the required information,
				// it is currently not a recoverable error, so we kill the whole
				// agent, potentially enabling human intervention to fix
				// the agent's configuration file.
				// In the future, we may retrieve the state server certificate
				// and key from the state, and this should then change.
				info, ok := agentConfig.StateServingInfo()
				if !ok {
					return nil, &fatalError{"StateServingInfo not available and we need it"}
				}
				port := info.APIPort
				cert := []byte(info.Cert)
				key := []byte(info.PrivateKey)

				if len(cert) == 0 || len(key) == 0 {
					return nil, &fatalError{"configuration does not have state server cert/key"}
				}
				dataDir := agentConfig.DataDir()
				logDir := agentConfig.LogDir()
				return apiserver.NewServer(st, apiserver.ServerConfig{
					Addr:    fmt.Sprintf(":%d", port),
					Cert:    cert,
					Key:     key,
					DataDir: dataDir,
					LogDir:  logDir,
				})
			})
			a.startWorkerAfterUpgrade(singularRunner, "cleaner", func() (worker.Worker, error) {
				return cleaner.NewCleaner(st), nil
			})
			a.startWorkerAfterUpgrade(singularRunner, "resumer", func() (worker.Worker, error) {
				// The action of resumer is so subtle that it is not tested,
				// because we can't figure out how to do so without brutalising
				// the transaction log.
				return resumer.NewResumer(st), nil
			})
			a.startWorkerAfterUpgrade(singularRunner, "minunitsworker", func() (worker.Worker, error) {
				return minunitsworker.NewMinUnitsWorker(st), nil
			})
		case state.JobManageStateDeprecated:
			// Legacy environments may set this, but we ignore it.
		default:
			logger.Warningf("ignoring unknown job %q", job)
		}
	}
	return newCloseWorker(runner, st), nil
}

// ensureMongoServer ensures that mongo is installed and running,
// and ready for opening a state connection.
func (a *MachineAgent) ensureMongoServer(agentConfig agent.Config) error {
	servingInfo, ok := agentConfig.StateServingInfo()
	if !ok {
		return fmt.Errorf("state worker was started with no state serving info")
	}
	namespace := agentConfig.Value(agent.Namespace)

	// When upgrading from a pre-HA-capable environment,
	// we must add machine-0 to the admin database and
	// initiate its replicaset.
	//
	// TODO(axw) remove this when we no longer need
	// to upgrade from pre-HA-capable environments.
	var shouldInitiateMongoServer bool
	var addrs []network.Address
	if isPreHAVersion(agentConfig.UpgradedToVersion()) {
		_, err := a.ensureMongoAdminUser(agentConfig)
		if err != nil {
			return err
		}
		if servingInfo.SharedSecret == "" {
			servingInfo.SharedSecret, err = mongo.GenerateSharedSecret()
			if err != nil {
				return err
			}
			if err = a.ChangeConfig(func(config agent.ConfigSetter) {
				config.SetStateServingInfo(servingInfo)
			}); err != nil {
				return err
			}
			agentConfig = a.CurrentConfig()
		}
		st, m, err := openState(agentConfig)
		if err != nil {
			return err
		}
		if err := st.SetStateServingInfo(servingInfo); err != nil {
			st.Close()
			return fmt.Errorf("cannot set state serving info: %v", err)
		}
		st.Close()
		addrs = m.Addresses()
		shouldInitiateMongoServer = true
	}

	// ensureMongoServer installs/upgrades the upstart config as necessary.
	if err := ensureMongoServer(
		agentConfig.DataDir(),
		namespace,
		servingInfo,
	); err != nil {
		return err
	}
	if !shouldInitiateMongoServer {
		return nil
	}

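	// Everything from here on only runs for environments upgraded from a
	// pre-HA juju: the mongo service itself has already been ensured above,
	// and what remains is the one-off replica-set initiation.
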
	// Initiate the replicaset for upgraded environments.
	//
	// TODO(axw) remove this when we no longer need
	// to upgrade from pre-HA-capable environments.
	stateInfo, ok := agentConfig.StateInfo()
	if !ok {
		return fmt.Errorf("state worker was started with no state info")
	}
	dialInfo, err := mongo.DialInfo(stateInfo.Info, mongo.DefaultDialOpts())
	if err != nil {
		return err
	}
	peerAddr := mongo.SelectPeerAddress(addrs)
	if peerAddr == "" {
		return fmt.Errorf("no appropriate peer address found in %q", addrs)
	}
	return maybeInitiateMongoServer(peergrouper.InitiateMongoParams{
		DialInfo:       dialInfo,
		MemberHostPort: net.JoinHostPort(peerAddr, fmt.Sprint(servingInfo.StatePort)),
		User:           stateInfo.Tag,
		Password:       stateInfo.Password,
	})
}

func (a *MachineAgent) ensureMongoAdminUser(agentConfig agent.Config) (added bool, err error) {
	stateInfo, ok1 := agentConfig.StateInfo()
	servingInfo, ok2 := agentConfig.StateServingInfo()
	if !ok1 || !ok2 {
		return false, fmt.Errorf("no state serving info configuration")
	}
	dialInfo, err := mongo.DialInfo(stateInfo.Info, mongo.DefaultDialOpts())
	if err != nil {
		return false, err
	}
	if len(dialInfo.Addrs) > 1 {
		logger.Infof("more than one state server; admin user must exist")
		return false, nil
	}
	return ensureMongoAdminUser(mongo.EnsureAdminUserParams{
		DialInfo:  dialInfo,
		Namespace: agentConfig.Value(agent.Namespace),
		DataDir:   agentConfig.DataDir(),
		Port:      servingInfo.StatePort,
		User:      stateInfo.Tag,
		Password:  stateInfo.Password,
	})
}

func isPreHAVersion(v version.Number) bool {
	return v.Compare(version.MustParse("1.19.0")) < 0
}

func openState(agentConfig agent.Config) (_ *state.State, _ *state.Machine, err error) {
	info, ok := agentConfig.StateInfo()
	if !ok {
		return nil, nil, fmt.Errorf("no state info available")
	}
	st, err := state.Open(info, mongo.DialOpts{}, environs.NewStatePolicy())
	if err != nil {
		return nil, nil, err
	}
	defer func() {
		if err != nil {
			st.Close()
		}
	}()
	m0, err := st.FindEntity(agentConfig.Tag())
	if err != nil {
		if errors.IsNotFound(err) {
			err = worker.ErrTerminateAgent
		}
		return nil, nil, err
	}
	m := m0.(*state.Machine)
	if m.Life() == state.Dead {
		return nil, nil, worker.ErrTerminateAgent
	}
	// Check that the machine nonce as provisioned matches the agent.Conf value.
	if !m.CheckProvisioned(agentConfig.Nonce()) {
		// The agent is running on a different machine from the one it
		// should be running on according to state. It must stop immediately.
		logger.Errorf("running machine %v agent on inappropriate instance", m)
		return nil, nil, worker.ErrTerminateAgent
	}
	return st, m, nil
}

// startWorkerAfterUpgrade starts a worker to run the specified child worker
// but only after waiting for upgrades to complete.
func (a *MachineAgent) startWorkerAfterUpgrade(runner worker.Runner, name string, start func() (worker.Worker, error)) {
	runner.StartWorker(name, func() (worker.Worker, error) {
		return a.upgradeWaiterWorker(start), nil
	})
}

// upgradeWaiterWorker runs the specified worker after upgrades have completed.
func (a *MachineAgent) upgradeWaiterWorker(start func() (worker.Worker, error)) worker.Worker {
	return worker.NewSimpleWorker(func(stop <-chan struct{}) error {
		// Wait for the upgrade to complete (or for us to be stopped).
		select {
		case <-stop:
			return nil
		case <-a.upgradeComplete:
		}
		// Upgrades are done, start the worker.
		worker, err := start()
		if err != nil {
			return err
		}
		// Wait for the worker to finish or for us to be stopped.
		waitCh := make(chan error)
		go func() {
			waitCh <- worker.Wait()
		}()
		select {
		case err := <-waitCh:
			return err
		case <-stop:
			worker.Kill()
		}
		return <-waitCh // Ensure worker has stopped before returning.
	})
}

// upgradeWorker runs the required upgrade operations to upgrade to the current Juju version.
func (a *MachineAgent) upgradeWorker(
	apiState *api.State,
	jobs []params.MachineJob,
	agentConfig agent.Config,
) worker.Worker {
	return worker.NewSimpleWorker(func(stop <-chan struct{}) error {
		select {
		case <-a.upgradeComplete:
			// Our work is already done (we're probably being restarted
			// because the API connection has gone down), so do nothing.
			<-stop
			return nil
		default:
		}
		// If the machine agent is a state server, flag that state
		// needs to be opened before running upgrade steps.
		needsState := false
		for _, job := range jobs {
			if job == params.JobManageEnviron {
				needsState = true
			}
		}
		// We need a *state.State for upgrades. We open it independently
		// of StateWorker, because we have no guarantees about when
		// and how often StateWorker might run.
		var st *state.State
		if needsState {
			var err error
			info, ok := agentConfig.StateInfo()
			if !ok {
				return fmt.Errorf("no state info available")
			}
			st, err = state.Open(info, mongo.DialOpts{}, environs.NewStatePolicy())
			if err != nil {
				return err
			}
			defer st.Close()
		}
		err := a.runUpgrades(st, apiState, jobs, agentConfig)
		if err != nil {
			return err
		}
		logger.Infof("upgrade to %v completed.", version.Current)
		close(a.upgradeComplete)
		<-stop
		return nil
	})
}

// runUpgrades runs the upgrade operations for each job type and updates the
// upgradedToVersion on success.
func (a *MachineAgent) runUpgrades(
	st *state.State,
	apiState *api.State,
	jobs []params.MachineJob,
	agentConfig agent.Config,
) error {
	from := version.Current
	from.Number = agentConfig.UpgradedToVersion()
	if from == version.Current {
		logger.Infof("upgrade to %v already completed.", version.Current)
		return nil
	}
	var err error
	writeErr := a.ChangeConfig(func(agentConfig agent.ConfigSetter) {
		context := upgrades.NewContext(agentConfig, apiState, st)
		for _, job := range jobs {
			target := upgradeTarget(job)
			if target == "" {
				continue
			}
			logger.Infof("starting upgrade from %v to %v for %v %q", from, version.Current, target, a.Tag())
			if err = upgrades.PerformUpgrade(from.Number, target, context); err != nil {
				err = fmt.Errorf("cannot perform upgrade from %v to %v for %v %q: %v", from, version.Current, target, a.Tag(), err)
				return
			}
		}
		agentConfig.SetUpgradedToVersion(version.Current.Number)
	})
	if writeErr != nil {
		return fmt.Errorf("cannot write updated agent configuration: %v", writeErr)
	}
	// Propagate any error from the upgrade steps themselves; otherwise a
	// failed PerformUpgrade would be silently reported as success.
	return err
}

func upgradeTarget(job params.MachineJob) upgrades.Target {
	switch job {
	case params.JobManageEnviron:
		return upgrades.StateServer
	case params.JobHostUnits:
		return upgrades.HostMachine
	}
	return ""
}

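// As a concrete example of the mapping above: a machine whose jobs are
// [JobManageEnviron, JobHostUnits] runs upgrade steps for both the
// StateServer and HostMachine targets, while JobManageStateDeprecated maps
// to the empty target and is skipped by runUpgrades.
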
// WorkersStarted returns a channel that's closed once all top level workers
// have been started. This is provided for testing purposes.
func (a *MachineAgent) WorkersStarted() <-chan struct{} {
	return a.workersStarted
}

func (a *MachineAgent) Tag() string {
	return names.NewMachineTag(a.MachineId).String()
}

func (a *MachineAgent) createJujuRun(dataDir string) error {
	// TODO do not remove the symlink if it already points
	// to the right place.
	if err := os.Remove(jujuRun); err != nil && !os.IsNotExist(err) {
		return err
	}
	jujud := filepath.Join(dataDir, "tools", a.Tag(), "jujud")
	return os.Symlink(jujud, jujuRun)
}

func (a *MachineAgent) uninstallAgent(agentConfig agent.Config) error {
	var errors []error
	agentServiceName := agentConfig.Value(agent.AgentServiceName)
	if agentServiceName == "" {
		// For backwards compatibility, handle lack of AgentServiceName.
		agentServiceName = os.Getenv("UPSTART_JOB")
	}
	if agentServiceName != "" {
		if err := upstart.NewService(agentServiceName).Remove(); err != nil {
			errors = append(errors, fmt.Errorf("cannot remove service %q: %v", agentServiceName, err))
		}
	}
	// Remove the juju-run symlink.
	if err := os.Remove(jujuRun); err != nil && !os.IsNotExist(err) {
		errors = append(errors, err)
	}

	namespace := agentConfig.Value(agent.Namespace)
	if err := mongo.RemoveService(namespace); err != nil {
		errors = append(errors, fmt.Errorf("cannot stop/remove mongo service with namespace %q: %v", namespace, err))
	}
	if err := os.RemoveAll(agentConfig.DataDir()); err != nil {
		errors = append(errors, err)
	}
	if len(errors) == 0 {
		return nil
	}
	return fmt.Errorf("uninstall failed: %v", errors)
}

// singularAPIConn implements singular.Conn on
// top of an API connection.
type singularAPIConn struct {
	apiState   *api.State
	agentState *apiagent.State
}

func (c singularAPIConn) IsMaster() (bool, error) {
	return c.agentState.IsMaster()
}

func (c singularAPIConn) Ping() error {
	return c.apiState.Ping()
}

// singularStateConn implements singular.Conn on
// top of a State connection.
type singularStateConn struct {
	session *mgo.Session
	machine *state.Machine
}

func (c singularStateConn) IsMaster() (bool, error) {
	return mongo.IsMaster(c.session, c.machine)
}

func (c singularStateConn) Ping() error {
	return c.session.Ping()
}