github.com/mattyw/juju@v0.0.0-20140610034352-732aecd63861/cmd/jujud/machine.go

// Copyright 2012, 2013 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package main

import (
	"fmt"
	"net"
	"os"
	"path/filepath"
	"runtime"
	"time"

	"github.com/juju/errors"
	"github.com/juju/loggo"
	"github.com/juju/names"
	"github.com/juju/utils"
	"github.com/juju/utils/voyeur"
	"labix.org/v2/mgo"
	"launchpad.net/gnuflag"
	"launchpad.net/tomb"

	"github.com/juju/juju/agent"
	"github.com/juju/juju/agent/mongo"
	"github.com/juju/juju/charm"
	"github.com/juju/juju/cmd"
	"github.com/juju/juju/container/kvm"
	"github.com/juju/juju/environs"
	"github.com/juju/juju/instance"
	"github.com/juju/juju/provider"
	"github.com/juju/juju/state"
	"github.com/juju/juju/state/api"
	apiagent "github.com/juju/juju/state/api/agent"
	"github.com/juju/juju/state/api/params"
	"github.com/juju/juju/state/apiserver"
	"github.com/juju/juju/upgrades"
	"github.com/juju/juju/upstart"
	"github.com/juju/juju/version"
	"github.com/juju/juju/worker"
	"github.com/juju/juju/worker/apiaddressupdater"
	"github.com/juju/juju/worker/authenticationworker"
	"github.com/juju/juju/worker/charmrevisionworker"
	"github.com/juju/juju/worker/cleaner"
	"github.com/juju/juju/worker/deployer"
	"github.com/juju/juju/worker/firewaller"
	"github.com/juju/juju/worker/instancepoller"
	"github.com/juju/juju/worker/localstorage"
	workerlogger "github.com/juju/juju/worker/logger"
	"github.com/juju/juju/worker/machineenvironmentworker"
	"github.com/juju/juju/worker/machiner"
	"github.com/juju/juju/worker/minunitsworker"
	"github.com/juju/juju/worker/peergrouper"
	"github.com/juju/juju/worker/provisioner"
	"github.com/juju/juju/worker/resumer"
	"github.com/juju/juju/worker/rsyslog"
	"github.com/juju/juju/worker/singular"
	"github.com/juju/juju/worker/terminationworker"
	"github.com/juju/juju/worker/upgrader"
)

var logger = loggo.GetLogger("juju.cmd.jujud")

var newRunner = worker.NewRunner

const bootstrapMachineId = "0"

// eitherState can be either a *state.State or an *api.State.
type eitherState interface{}

var (
	retryDelay      = 3 * time.Second
	jujuRun         = "/usr/local/bin/juju-run"
	useMultipleCPUs = utils.UseMultipleCPUs

	// The following are defined as variables to
	// allow the tests to intercept calls to the functions.
	ensureMongoServer        = mongo.EnsureServer
	maybeInitiateMongoServer = peergrouper.MaybeInitiateMongoServer
	ensureMongoAdminUser     = mongo.EnsureAdminUser
	newSingularRunner        = singular.New
	peergrouperNew           = peergrouper.New

	// reportOpenedState is exposed for tests to know when
	// the State has been successfully opened.
	reportOpenedState = func(eitherState) {}

	// reportOpenedAPI is exposed for tests to know when
	// the API has been successfully opened.
	reportOpenedAPI = func(eitherState) {}
)

// MachineAgent is a cmd.Command responsible for running a machine agent.
type MachineAgent struct {
	cmd.CommandBase
	tomb tomb.Tomb
	AgentConf
	MachineId        string
	runner           worker.Runner
	configChangedVal voyeur.Value
	upgradeComplete  chan struct{}
	workersStarted   chan struct{}
	st               *state.State
}
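
// For illustration only: a hedged sketch of how a MachineAgent is driven
// through the cmd.Command lifecycle by the jujud entry point (flagSet, args
// and ctx are hypothetical stand-ins, not defined in this file):
//
//	a := &MachineAgent{}                       // normally constructed by jujud's main
//	a.SetFlags(flagSet)                        // registers --machine-id and the agent flags
//	if err := a.Init(args); err != nil { ... } // validates --machine-id
//	err := a.Run(ctx)                          // blocks until the runner stops
//	_ = a.Stop()                               // or stop it from another goroutine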

// Info returns usage information for the command.
func (a *MachineAgent) Info() *cmd.Info {
	return &cmd.Info{
		Name:    "machine",
		Purpose: "run a juju machine agent",
	}
}

// SetFlags adds the flags relevant to the machine agent to the given flag set.
func (a *MachineAgent) SetFlags(f *gnuflag.FlagSet) {
	a.AgentConf.AddFlags(f)
	f.StringVar(&a.MachineId, "machine-id", "", "id of the machine to run")
}

// Init initializes the command for running.
func (a *MachineAgent) Init(args []string) error {
	if !names.IsMachine(a.MachineId) {
		return fmt.Errorf("--machine-id option must be set, and expects a non-negative integer")
	}
	if err := a.AgentConf.CheckArgs(args); err != nil {
		return err
	}
	a.runner = newRunner(isFatal, moreImportant)
	a.upgradeComplete = make(chan struct{})
	a.workersStarted = make(chan struct{})
	return nil
}

// Wait waits for the machine agent to finish.
func (a *MachineAgent) Wait() error {
	return a.tomb.Wait()
}

// Stop stops the machine agent.
func (a *MachineAgent) Stop() error {
	a.runner.Kill()
	return a.tomb.Wait()
}

// Run runs a machine agent.
func (a *MachineAgent) Run(_ *cmd.Context) error {
	// Due to changes in the logging, and needing to care about old
	// environments that have been upgraded, we need to explicitly remove the
	// file writer if one has been added; otherwise we will get duplicate
	// lines of all logging in the log file.
	loggo.RemoveWriter("logfile")
	defer a.tomb.Done()
	logger.Infof("machine agent %v start (%s [%s])", a.Tag(), version.Current, runtime.Compiler)
	if err := a.ReadConfig(a.Tag()); err != nil {
		return fmt.Errorf("cannot read agent configuration: %v", err)
	}
	a.configChangedVal.Set(struct{}{})
	agentConfig := a.CurrentConfig()
	charm.CacheDir = filepath.Join(agentConfig.DataDir(), "charmcache")
	if err := a.createJujuRun(agentConfig.DataDir()); err != nil {
		return fmt.Errorf("cannot create juju run symlink: %v", err)
	}
	a.runner.StartWorker("api", a.APIWorker)
	a.runner.StartWorker("statestarter", a.newStateStarterWorker)
	a.runner.StartWorker("termination", func() (worker.Worker, error) {
		return terminationworker.NewWorker(), nil
	})
	// At this point, all workers will have been configured to start.
	close(a.workersStarted)
	err := a.runner.Wait()
	if err == worker.ErrTerminateAgent {
		err = a.uninstallAgent(agentConfig)
	}
	err = agentDone(err)
	a.tomb.Kill(err)
	return err
}

// ChangeConfig mutates the agent configuration and notifies any
// watchers of the change.
func (a *MachineAgent) ChangeConfig(mutate func(config agent.ConfigSetter)) error {
	err := a.AgentConf.ChangeConfig(mutate)
	a.configChangedVal.Set(struct{}{})
	return err
}

// newStateStarterWorker wraps stateStarter in a simple worker for use in
// a.runner.StartWorker.
func (a *MachineAgent) newStateStarterWorker() (worker.Worker, error) {
	return worker.NewSimpleWorker(a.stateStarter), nil
}
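
// A minimal sketch of the voyeur.Value notification pattern that stateStarter
// (below) relies on; illustrative only, see github.com/juju/utils/voyeur for
// the real API:
//
//	var v voyeur.Value
//	w := v.Watch()
//	go func() {
//		for w.Next() { // blocks until the next Set call
//			// react to the new value
//		}
//	}()
//	v.Set(struct{}{}) // wakes the watcher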

// stateStarter watches for changes to the agent configuration, and
// starts or stops the state worker as appropriate. We watch the agent
// configuration because the agent configuration has all the details
// that we need to start a state server, whether they have been cached
// or read from the state.
//
// It will stop working as soon as stopch is closed.
func (a *MachineAgent) stateStarter(stopch <-chan struct{}) error {
	confWatch := a.configChangedVal.Watch()
	defer confWatch.Close()
	watchCh := make(chan struct{})
	go func() {
		for confWatch.Next() {
			watchCh <- struct{}{}
		}
	}()
	for {
		select {
		case <-watchCh:
			agentConfig := a.CurrentConfig()

			// N.B. StartWorker and StopWorker are idempotent.
			_, ok := agentConfig.StateServingInfo()
			if ok {
				a.runner.StartWorker("state", func() (worker.Worker, error) {
					return a.StateWorker()
				})
			} else {
				a.runner.StopWorker("state")
			}
		case <-stopch:
			return nil
		}
	}
}

// APIWorker returns a Worker that connects to the API and starts any
// workers that need an API connection.
func (a *MachineAgent) APIWorker() (worker.Worker, error) {
	agentConfig := a.CurrentConfig()
	st, entity, err := openAPIState(agentConfig, a)
	if err != nil {
		return nil, err
	}
	reportOpenedAPI(st)

	// Refresh the configuration, since it may have been updated after opening state.
	agentConfig = a.CurrentConfig()

	for _, job := range entity.Jobs() {
		if job.NeedsState() {
			info, err := st.Agent().StateServingInfo()
			if err != nil {
				return nil, fmt.Errorf("cannot get state serving info: %v", err)
			}
			err = a.ChangeConfig(func(config agent.ConfigSetter) {
				config.SetStateServingInfo(info)
			})
			if err != nil {
				return nil, err
			}
			agentConfig = a.CurrentConfig()
			break
		}
	}

	rsyslogMode := rsyslog.RsyslogModeForwarding
	runner := newRunner(connectionIsFatal(st), moreImportant)
	var singularRunner worker.Runner
	for _, job := range entity.Jobs() {
		if job == params.JobManageEnviron {
			rsyslogMode = rsyslog.RsyslogModeAccumulate
			conn := singularAPIConn{st, st.Agent()}
			singularRunner, err = newSingularRunner(runner, conn)
			if err != nil {
				return nil, fmt.Errorf("cannot make singular API Runner: %v", err)
			}
			break
		}
	}

	// Run the upgrader and the upgrade-steps worker without waiting for
	// the upgrade steps to complete.
	runner.StartWorker("upgrader", func() (worker.Worker, error) {
		return upgrader.NewUpgrader(st.Upgrader(), agentConfig), nil
	})
	runner.StartWorker("upgrade-steps", func() (worker.Worker, error) {
		return a.upgradeWorker(st, entity.Jobs(), agentConfig), nil
	})

	// All other workers must wait for the upgrade steps to complete
	// before starting.
	a.startWorkerAfterUpgrade(runner, "machiner", func() (worker.Worker, error) {
		return machiner.NewMachiner(st.Machiner(), agentConfig), nil
	})
	a.startWorkerAfterUpgrade(runner, "apiaddressupdater", func() (worker.Worker, error) {
		return apiaddressupdater.NewAPIAddressUpdater(st.Machiner(), a), nil
	})
	a.startWorkerAfterUpgrade(runner, "logger", func() (worker.Worker, error) {
		return workerlogger.NewLogger(st.Logger(), agentConfig), nil
	})
	a.startWorkerAfterUpgrade(runner, "machineenvironmentworker", func() (worker.Worker, error) {
		return machineenvironmentworker.NewMachineEnvironmentWorker(st.Environment(), agentConfig), nil
	})
	a.startWorkerAfterUpgrade(runner, "rsyslog", func() (worker.Worker, error) {
		return newRsyslogConfigWorker(st.Rsyslog(), agentConfig, rsyslogMode)
	})

	// If not a local provider bootstrap machine, start the worker to
	// manage SSH keys.
	providerType := agentConfig.Value(agent.ProviderType)
	if providerType != provider.Local || a.MachineId != bootstrapMachineId {
		a.startWorkerAfterUpgrade(runner, "authenticationworker", func() (worker.Worker, error) {
			return authenticationworker.NewWorker(st.KeyUpdater(), agentConfig), nil
		})
	}

	// Perform the operations needed to set up hosting for containers.
	if err := a.setupContainerSupport(runner, st, entity, agentConfig); err != nil {
		return nil, fmt.Errorf("setting up container support: %v", err)
	}
	for _, job := range entity.Jobs() {
		switch job {
		case params.JobHostUnits:
			a.startWorkerAfterUpgrade(runner, "deployer", func() (worker.Worker, error) {
				apiDeployer := st.Deployer()
				context := newDeployContext(apiDeployer, agentConfig)
				return deployer.NewDeployer(apiDeployer, context), nil
			})
		case params.JobManageEnviron:
			a.startWorkerAfterUpgrade(singularRunner, "environ-provisioner", func() (worker.Worker, error) {
				return provisioner.NewEnvironProvisioner(st.Provisioner(), agentConfig), nil
			})
			// TODO(axw) 2013-09-24 bug #1229506
			// Make another job to enable the firewaller. Not all
			// environments are capable of managing ports
			// centrally.
			a.startWorkerAfterUpgrade(singularRunner, "firewaller", func() (worker.Worker, error) {
				return firewaller.NewFirewaller(st.Firewaller())
			})
			a.startWorkerAfterUpgrade(singularRunner, "charm-revision-updater", func() (worker.Worker, error) {
				return charmrevisionworker.NewRevisionUpdateWorker(st.CharmRevisionUpdater()), nil
			})
		case params.JobManageStateDeprecated:
			// Legacy environments may set this, but we ignore it.
		default:
			// TODO(dimitern): Once all workers moved over to using
			// the API, report "unknown job type" here.
		}
	}
	return newCloseWorker(runner, st), nil // Note: a worker.Runner is itself a worker.Worker.
}

// setupContainerSupport determines what containers can be run on this machine and
// initialises suitable infrastructure to support such containers.
func (a *MachineAgent) setupContainerSupport(runner worker.Runner, st *api.State, entity *apiagent.Entity, agentConfig agent.Config) error {
	var supportedContainers []instance.ContainerType
	// We don't yet support nested LXC containers, but anything else can run an LXC container.
	if entity.ContainerType() != instance.LXC {
		supportedContainers = append(supportedContainers, instance.LXC)
	}
	supportsKvm, err := kvm.IsKVMSupported()
	if err != nil {
		logger.Warningf("determining kvm support: %v\nno kvm containers possible", err)
	}
	if err == nil && supportsKvm {
		supportedContainers = append(supportedContainers, instance.KVM)
	}
	return a.updateSupportedContainers(runner, st, entity.Tag(), supportedContainers, agentConfig)
}

// updateSupportedContainers records in state that a machine can run the specified containers.
// It starts a watcher and when a container of a given type is first added to the machine,
// the watcher is killed, the machine is set up to be able to start containers of the given type,
// and a suitable provisioner is started.
func (a *MachineAgent) updateSupportedContainers(
	runner worker.Runner,
	st *api.State,
	tag string,
	containers []instance.ContainerType,
	agentConfig agent.Config,
) error {
	pr := st.Provisioner()
	machine, err := pr.Machine(tag)
	if err != nil {
		return fmt.Errorf("%s is not in state: %v", tag, err)
	}
	if len(containers) == 0 {
		if err := machine.SupportsNoContainers(); err != nil {
			return fmt.Errorf("clearing supported containers for %s: %v", tag, err)
		}
		return nil
	}
	if err := machine.SetSupportedContainers(containers...); err != nil {
		return fmt.Errorf("setting supported containers for %s: %v", tag, err)
	}
	initLock, err := hookExecutionLock(agentConfig.DataDir())
	if err != nil {
		return err
	}
	// Start the watcher to fire when a container is first requested on the machine.
	watcherName := fmt.Sprintf("%s-container-watcher", machine.Id())
	handler := provisioner.NewContainerSetupHandler(
		runner,
		watcherName,
		containers,
		machine,
		pr,
		agentConfig,
		initLock,
	)
	a.startWorkerAfterUpgrade(runner, watcherName, func() (worker.Worker, error) {
		return worker.NewStringsWorker(handler), nil
	})
	return nil
}

// StateWorker returns a worker running all the workers that require
// a *state.State connection.
func (a *MachineAgent) StateWorker() (worker.Worker, error) {
	agentConfig := a.CurrentConfig()

	// Create the system-identity file.
	if err := agent.WriteSystemIdentityFile(agentConfig); err != nil {
		return nil, err
	}

	// Start the MongoDB server.
	if err := a.ensureMongoServer(agentConfig); err != nil {
		return nil, err
	}
	st, m, err := openState(agentConfig)
	if err != nil {
		return nil, err
	}
	reportOpenedState(st)

	singularStateConn := singularStateConn{st.MongoSession(), m}
	runner := newRunner(connectionIsFatal(st), moreImportant)
	singularRunner, err := newSingularRunner(runner, singularStateConn)
	if err != nil {
		return nil, fmt.Errorf("cannot make singular State Runner: %v", err)
	}

	// Take advantage of special knowledge here in that we will only ever want
	// the storage provider on one machine, and that is the "bootstrap" node.
	providerType := agentConfig.Value(agent.ProviderType)
	if (providerType == provider.Local || provider.IsManual(providerType)) && m.Id() == bootstrapMachineId {
		a.startWorkerAfterUpgrade(runner, "local-storage", func() (worker.Worker, error) {
			// TODO(axw) 2013-09-24 bug #1229507
			// Make another job to enable storage.
			// There's nothing special about this.
			return localstorage.NewWorker(agentConfig), nil
		})
	}
	for _, job := range m.Jobs() {
		switch job {
		case state.JobHostUnits:
			// Implemented in APIWorker.
		case state.JobManageEnviron:
			useMultipleCPUs()
			a.startWorkerAfterUpgrade(runner, "instancepoller", func() (worker.Worker, error) {
				return instancepoller.NewWorker(st), nil
			})
			if shouldEnableHA(agentConfig) {
				a.startWorkerAfterUpgrade(runner, "peergrouper", func() (worker.Worker, error) {
					return peergrouperNew(st)
				})
			}
			runner.StartWorker("apiserver", func() (worker.Worker, error) {
				// If the configuration does not have the required information,
				// it is currently not a recoverable error, so we kill the whole
				// agent, potentially enabling human intervention to fix
				// the agent's configuration file. In the future, we may retrieve
				// the state server certificate and key from the state, and
				// this should then change.
				info, ok := agentConfig.StateServingInfo()
				if !ok {
					return nil, &fatalError{"StateServingInfo not available and we need it"}
				}
				port := info.APIPort
				cert := []byte(info.Cert)
				key := []byte(info.PrivateKey)

				if len(cert) == 0 || len(key) == 0 {
					return nil, &fatalError{"configuration does not have state server cert/key"}
				}
				dataDir := agentConfig.DataDir()
				logDir := agentConfig.LogDir()
				return apiserver.NewServer(
					st, fmt.Sprintf(":%d", port), cert, key, dataDir, logDir)
			})
			a.startWorkerAfterUpgrade(singularRunner, "cleaner", func() (worker.Worker, error) {
				return cleaner.NewCleaner(st), nil
			})
			a.startWorkerAfterUpgrade(singularRunner, "resumer", func() (worker.Worker, error) {
				// The action of resumer is so subtle that it is not tested,
				// because we can't figure out how to do so without brutalising
				// the transaction log.
				return resumer.NewResumer(st), nil
			})
			a.startWorkerAfterUpgrade(singularRunner, "minunitsworker", func() (worker.Worker, error) {
				return minunitsworker.NewMinUnitsWorker(st), nil
			})
		case state.JobManageStateDeprecated:
			// Legacy environments may set this, but we ignore it.
		default:
			logger.Warningf("ignoring unknown job %q", job)
		}
	}
	return newCloseWorker(runner, st), nil
}

// ensureMongoServer ensures that mongo is installed and running,
// and ready for opening a state connection.
func (a *MachineAgent) ensureMongoServer(agentConfig agent.Config) error {
	servingInfo, ok := agentConfig.StateServingInfo()
	if !ok {
		return fmt.Errorf("state worker was started with no state serving info")
	}
	namespace := agentConfig.Value(agent.Namespace)
	withHA := shouldEnableHA(agentConfig)

	// When upgrading from a pre-HA-capable environment,
	// we must add machine-0 to the admin database and
	// initiate its replicaset.
	//
	// TODO(axw) remove this when we no longer need
	// to upgrade from pre-HA-capable environments.
	var shouldInitiateMongoServer bool
	var addrs []instance.Address
	if isPreHAVersion(agentConfig.UpgradedToVersion()) {
		_, err := a.ensureMongoAdminUser(agentConfig)
		if err != nil {
			return err
		}
		if servingInfo.SharedSecret == "" {
			servingInfo.SharedSecret, err = mongo.GenerateSharedSecret()
			if err != nil {
				return err
			}
			if err = a.ChangeConfig(func(config agent.ConfigSetter) {
				config.SetStateServingInfo(servingInfo)
			}); err != nil {
				return err
			}
			agentConfig = a.CurrentConfig()
		}
		st, m, err := openState(agentConfig)
		if err != nil {
			return err
		}
		if err := st.SetStateServingInfo(servingInfo); err != nil {
			st.Close()
			return fmt.Errorf("cannot set state serving info: %v", err)
		}
		st.Close()
		addrs = m.Addresses()
		shouldInitiateMongoServer = withHA
	}

	// ensureMongoServer installs/upgrades the upstart config as necessary.
	if err := ensureMongoServer(
		agentConfig.DataDir(),
		namespace,
		servingInfo,
		withHA,
	); err != nil {
		return err
	}
	if !shouldInitiateMongoServer {
		return nil
	}

	// Initiate the replicaset for upgraded environments.
	//
	// TODO(axw) remove this when we no longer need
	// to upgrade from pre-HA-capable environments.
	stateInfo, ok := agentConfig.StateInfo()
	if !ok {
		return fmt.Errorf("state worker was started with no state info")
	}
	dialInfo, err := state.DialInfo(stateInfo, state.DefaultDialOpts())
	if err != nil {
		return err
	}
	peerAddr := mongo.SelectPeerAddress(addrs)
	if peerAddr == "" {
		return fmt.Errorf("no appropriate peer address found in %q", addrs)
	}
	return maybeInitiateMongoServer(peergrouper.InitiateMongoParams{
		DialInfo:       dialInfo,
		MemberHostPort: net.JoinHostPort(peerAddr, fmt.Sprint(servingInfo.StatePort)),
		User:           stateInfo.Tag,
		Password:       stateInfo.Password,
	})
}

// ensureMongoAdminUser adds the agent's user to mongo's admin database,
// reporting whether the user was added.
func (a *MachineAgent) ensureMongoAdminUser(agentConfig agent.Config) (added bool, err error) {
	stateInfo, ok1 := agentConfig.StateInfo()
	servingInfo, ok2 := agentConfig.StateServingInfo()
	if !ok1 || !ok2 {
		return false, fmt.Errorf("no state serving info configuration")
	}
	dialInfo, err := state.DialInfo(stateInfo, state.DefaultDialOpts())
	if err != nil {
		return false, err
	}
	if len(dialInfo.Addrs) > 1 {
		logger.Infof("more than one state server; admin user must exist")
		return false, nil
	}
	return ensureMongoAdminUser(mongo.EnsureAdminUserParams{
		DialInfo:  dialInfo,
		Namespace: agentConfig.Value(agent.Namespace),
		DataDir:   agentConfig.DataDir(),
		Port:      servingInfo.StatePort,
		User:      stateInfo.Tag,
		Password:  stateInfo.Password,
	})
}

// isPreHAVersion reports whether the given version predates
// HA-capable (1.19.0 and later) environments.
func isPreHAVersion(v version.Number) bool {
	return v.Compare(version.MustParse("1.19.0")) < 0
}

// shouldEnableHA reports whether HA should be enabled.
//
// Eventually this should always be true, and ideally
// it should be true before 1.20 is released or we'll
// have more upgrade scenarios on our hands.
func shouldEnableHA(agentConfig agent.Config) bool {
	providerType := agentConfig.Value(agent.ProviderType)
	return providerType != provider.Local
}

// openState opens a direct connection to state using the agent's
// configuration, returning the state and the machine entity the agent
// runs on. It terminates the agent if the machine is dead or was
// provisioned with a different nonce.
func openState(agentConfig agent.Config) (_ *state.State, _ *state.Machine, err error) {
	info, ok := agentConfig.StateInfo()
	if !ok {
		return nil, nil, fmt.Errorf("no state info available")
	}
	st, err := state.Open(info, state.DialOpts{}, environs.NewStatePolicy())
	if err != nil {
		return nil, nil, err
	}
	defer func() {
		if err != nil {
			st.Close()
		}
	}()
	m0, err := st.FindEntity(agentConfig.Tag())
	if err != nil {
		if errors.IsNotFound(err) {
			err = worker.ErrTerminateAgent
		}
		return nil, nil, err
	}
	m := m0.(*state.Machine)
	if m.Life() == state.Dead {
		return nil, nil, worker.ErrTerminateAgent
	}
	// Check that the machine nonce as provisioned matches the agent.Conf value.
	if !m.CheckProvisioned(agentConfig.Nonce()) {
		// The agent is running on a different machine to the one it
		// should be according to state. It must stop immediately.
		logger.Errorf("running machine %v agent on inappropriate instance", m)
		return nil, nil, worker.ErrTerminateAgent
	}
	return st, m, nil
}
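
// A minimal sketch (illustrative only) of the closed-channel gate that
// startWorkerAfterUpgrade and upgradeWaiterWorker below rely on: closing
// a channel releases every current and future receiver at once.
//
//	gate := make(chan struct{})
//	go func() {
//		<-gate // blocks until the gate is opened
//		// safe to start post-upgrade work here
//	}()
//	close(gate) // opens the gate for all waiters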

// startWorkerAfterUpgrade starts a worker to run the specified child worker
// but only after waiting for upgrades to complete.
func (a *MachineAgent) startWorkerAfterUpgrade(runner worker.Runner, name string, start func() (worker.Worker, error)) {
	runner.StartWorker(name, func() (worker.Worker, error) {
		return a.upgradeWaiterWorker(start), nil
	})
}

// upgradeWaiterWorker runs the specified worker after upgrades have completed.
func (a *MachineAgent) upgradeWaiterWorker(start func() (worker.Worker, error)) worker.Worker {
	return worker.NewSimpleWorker(func(stop <-chan struct{}) error {
		// Wait for the upgrade to complete (or for us to be stopped).
		select {
		case <-stop:
			return nil
		case <-a.upgradeComplete:
		}
		w, err := start()
		if err != nil {
			return err
		}
		waitCh := make(chan error)
		go func() {
			waitCh <- w.Wait()
		}()
		select {
		case err := <-waitCh:
			return err
		case <-stop:
			w.Kill()
		}
		return <-waitCh
	})
}

// upgradeWorker runs the required upgrade operations to upgrade to the current Juju version.
func (a *MachineAgent) upgradeWorker(
	apiState *api.State,
	jobs []params.MachineJob,
	agentConfig agent.Config,
) worker.Worker {
	return worker.NewSimpleWorker(func(stop <-chan struct{}) error {
		select {
		case <-a.upgradeComplete:
			// Our work is already done (we're probably being restarted
			// because the API connection has gone down), so do nothing.
			<-stop
			return nil
		default:
		}
		// If the machine agent is a state server, flag that state
		// needs to be opened before running upgrade steps.
		needsState := false
		for _, job := range jobs {
			if job == params.JobManageEnviron {
				needsState = true
			}
		}
		// We need a *state.State for upgrades. We open it independently
		// of StateWorker, because we have no guarantees about when
		// and how often StateWorker might run.
		var st *state.State
		if needsState {
			var err error
			info, ok := agentConfig.StateInfo()
			if !ok {
				return fmt.Errorf("no state info available")
			}
			st, err = state.Open(info, state.DialOpts{}, environs.NewStatePolicy())
			if err != nil {
				return err
			}
			defer st.Close()
		}
		err := a.runUpgrades(st, apiState, jobs, agentConfig)
		if err != nil {
			return err
		}
		logger.Infof("upgrade to %v completed.", version.Current)
		close(a.upgradeComplete)
		<-stop
		return nil
	})
}
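
// Note (illustrative): a.upgradeComplete may only be closed once, so
// upgradeWorker guards against a second close with the non-blocking select
// shown above, a common Go idiom:
//
//	select {
//	case <-done:
//		// already closed: nothing left to do
//	default:
//		// first run: do the work, then close(done)
//	}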

// runUpgrades runs the upgrade operations for each job type and updates
// the agent's upgradedToVersion on success.
func (a *MachineAgent) runUpgrades(
	st *state.State,
	apiState *api.State,
	jobs []params.MachineJob,
	agentConfig agent.Config,
) error {
	from := version.Current
	from.Number = agentConfig.UpgradedToVersion()
	if from == version.Current {
		logger.Infof("upgrade to %v already completed.", version.Current)
		return nil
	}
	var err error
	writeErr := a.ChangeConfig(func(agentConfig agent.ConfigSetter) {
		context := upgrades.NewContext(agentConfig, apiState, st)
		for _, job := range jobs {
			target := upgradeTarget(job)
			if target == "" {
				continue
			}
			logger.Infof("starting upgrade from %v to %v for %v %q", from, version.Current, target, a.Tag())
			if err = upgrades.PerformUpgrade(from.Number, target, context); err != nil {
				err = fmt.Errorf("cannot perform upgrade from %v to %v for %v %q: %v", from, version.Current, target, a.Tag(), err)
				return
			}
		}
		agentConfig.SetUpgradedToVersion(version.Current.Number)
	})
	if writeErr != nil {
		return fmt.Errorf("cannot write updated agent configuration: %v", writeErr)
	}
	return nil
}

// upgradeTarget maps a machine job to the upgrade target it implies.
func upgradeTarget(job params.MachineJob) upgrades.Target {
	switch job {
	case params.JobManageEnviron:
		return upgrades.StateServer
	case params.JobHostUnits:
		return upgrades.HostMachine
	}
	return ""
}

// WorkersStarted returns a channel that's closed once all top level workers
// have been started. This is provided for testing purposes.
func (a *MachineAgent) WorkersStarted() <-chan struct{} {
	return a.workersStarted
}

// Tag returns the agent's machine tag, e.g. "machine-0".
func (a *MachineAgent) Tag() string {
	return names.MachineTag(a.MachineId)
}

// createJujuRun creates the juju-run symlink pointing at the
// agent's jujud binary.
func (a *MachineAgent) createJujuRun(dataDir string) error {
	// TODO do not remove the symlink if it already points
	// to the right place.
	if err := os.Remove(jujuRun); err != nil && !os.IsNotExist(err) {
		return err
	}
	jujud := filepath.Join(dataDir, "tools", a.Tag(), "jujud")
	return os.Symlink(jujud, jujuRun)
}

// uninstallAgent removes the agent's upstart service, the juju-run
// symlink, the mongo service and the data directory, collecting any
// errors encountered along the way.
func (a *MachineAgent) uninstallAgent(agentConfig agent.Config) error {
	var errs []error
	agentServiceName := agentConfig.Value(agent.AgentServiceName)
	if agentServiceName == "" {
		// For backwards compatibility, handle lack of AgentServiceName.
		agentServiceName = os.Getenv("UPSTART_JOB")
	}
	if agentServiceName != "" {
		if err := upstart.NewService(agentServiceName).Remove(); err != nil {
			errs = append(errs, fmt.Errorf("cannot remove service %q: %v", agentServiceName, err))
		}
	}
	// Remove the juju-run symlink.
	if err := os.Remove(jujuRun); err != nil && !os.IsNotExist(err) {
		errs = append(errs, err)
	}

	namespace := agentConfig.Value(agent.Namespace)
	if err := mongo.RemoveService(namespace); err != nil {
		errs = append(errs, fmt.Errorf("cannot stop/remove mongo service with namespace %q: %v", namespace, err))
	}
	if err := os.RemoveAll(agentConfig.DataDir()); err != nil {
		errs = append(errs, err)
	}
	if len(errs) == 0 {
		return nil
	}
	return fmt.Errorf("uninstall failed: %v", errs)
}
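
// The two singular.Conn implementations below let the same master-election
// machinery run over either an API connection (used by APIWorker) or a direct
// mongo session (used by StateWorker); either way, workers started on a
// singular runner only actually run on the current master.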

// singularAPIConn implements singular.Conn on
// top of an API connection.
type singularAPIConn struct {
	apiState   *api.State
	agentState *apiagent.State
}

// IsMaster reports whether this machine is the master state server.
func (c singularAPIConn) IsMaster() (bool, error) {
	return c.agentState.IsMaster()
}

// Ping checks that the API connection is still alive.
func (c singularAPIConn) Ping() error {
	return c.apiState.Ping()
}

// singularStateConn implements singular.Conn on
// top of a State connection.
type singularStateConn struct {
	session *mgo.Session
	machine *state.Machine
}

// IsMaster reports whether this machine is the master state server.
func (c singularStateConn) IsMaster() (bool, error) {
	return mongo.IsMaster(c.session, c.machine)
}

// Ping checks that the mongo session is still alive.
func (c singularStateConn) Ping() error {
	return c.session.Ping()
}
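
// A compile-time check, added for illustration on the assumption that
// singular.Conn is the interface expected by singular.New above; both
// connection types must satisfy it.
var (
	_ singular.Conn = singularAPIConn{}
	_ singular.Conn = singularStateConn{}
)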