github.com/cloud-green/juju@v0.0.0-20151002100041-a00291338d3d/cmd/jujud/agent/machine.go (about) 1 // Copyright 2012, 2013 Canonical Ltd. 2 // Licensed under the AGPLv3, see LICENCE file for details. 3 4 package agent 5 6 import ( 7 "fmt" 8 "io" 9 "net" 10 "os" 11 "path/filepath" 12 "runtime" 13 "strconv" 14 "sync" 15 "time" 16 17 "github.com/juju/cmd" 18 "github.com/juju/errors" 19 "github.com/juju/loggo" 20 "github.com/juju/names" 21 "github.com/juju/replicaset" 22 "github.com/juju/utils" 23 "github.com/juju/utils/clock" 24 "github.com/juju/utils/featureflag" 25 "github.com/juju/utils/series" 26 "github.com/juju/utils/set" 27 "github.com/juju/utils/symlink" 28 "github.com/juju/utils/voyeur" 29 "gopkg.in/juju/charmrepo.v1" 30 "gopkg.in/mgo.v2" 31 "gopkg.in/natefinch/lumberjack.v2" 32 "launchpad.net/gnuflag" 33 "launchpad.net/tomb" 34 35 "github.com/juju/juju/agent" 36 "github.com/juju/juju/api" 37 apiagent "github.com/juju/juju/api/agent" 38 apideployer "github.com/juju/juju/api/deployer" 39 "github.com/juju/juju/api/metricsmanager" 40 apiupgrader "github.com/juju/juju/api/upgrader" 41 "github.com/juju/juju/apiserver" 42 "github.com/juju/juju/apiserver/params" 43 "github.com/juju/juju/cert" 44 "github.com/juju/juju/cmd/jujud/reboot" 45 cmdutil "github.com/juju/juju/cmd/jujud/util" 46 "github.com/juju/juju/container" 47 "github.com/juju/juju/container/kvm" 48 "github.com/juju/juju/container/lxc" 49 "github.com/juju/juju/container/lxc/lxcutils" 50 "github.com/juju/juju/environs" 51 "github.com/juju/juju/environs/config" 52 "github.com/juju/juju/feature" 53 "github.com/juju/juju/instance" 54 jujunames "github.com/juju/juju/juju/names" 55 "github.com/juju/juju/juju/paths" 56 "github.com/juju/juju/mongo" 57 "github.com/juju/juju/network" 58 "github.com/juju/juju/provider" 59 "github.com/juju/juju/service" 60 "github.com/juju/juju/service/common" 61 "github.com/juju/juju/state" 62 "github.com/juju/juju/state/multiwatcher" 63 statestorage "github.com/juju/juju/state/storage" 64 "github.com/juju/juju/storage/looputil" 65 "github.com/juju/juju/version" 66 "github.com/juju/juju/worker" 67 "github.com/juju/juju/worker/addresser" 68 "github.com/juju/juju/worker/apiaddressupdater" 69 "github.com/juju/juju/worker/apicaller" 70 "github.com/juju/juju/worker/authenticationworker" 71 "github.com/juju/juju/worker/certupdater" 72 "github.com/juju/juju/worker/charmrevisionworker" 73 "github.com/juju/juju/worker/cleaner" 74 "github.com/juju/juju/worker/conv2state" 75 "github.com/juju/juju/worker/dblogpruner" 76 "github.com/juju/juju/worker/deployer" 77 "github.com/juju/juju/worker/diskmanager" 78 "github.com/juju/juju/worker/envworkermanager" 79 "github.com/juju/juju/worker/firewaller" 80 "github.com/juju/juju/worker/gate" 81 "github.com/juju/juju/worker/imagemetadataworker" 82 "github.com/juju/juju/worker/instancepoller" 83 "github.com/juju/juju/worker/localstorage" 84 workerlogger "github.com/juju/juju/worker/logger" 85 "github.com/juju/juju/worker/logsender" 86 "github.com/juju/juju/worker/machiner" 87 "github.com/juju/juju/worker/metricworker" 88 "github.com/juju/juju/worker/minunitsworker" 89 "github.com/juju/juju/worker/networker" 90 "github.com/juju/juju/worker/peergrouper" 91 "github.com/juju/juju/worker/provisioner" 92 "github.com/juju/juju/worker/proxyupdater" 93 rebootworker "github.com/juju/juju/worker/reboot" 94 "github.com/juju/juju/worker/resumer" 95 "github.com/juju/juju/worker/rsyslog" 96 "github.com/juju/juju/worker/singular" 97 "github.com/juju/juju/worker/statushistorypruner" 98 
"github.com/juju/juju/worker/storageprovisioner" 99 "github.com/juju/juju/worker/terminationworker" 100 "github.com/juju/juju/worker/toolsversionchecker" 101 "github.com/juju/juju/worker/txnpruner" 102 "github.com/juju/juju/worker/upgrader" 103 ) 104 105 const bootstrapMachineId = "0" 106 107 var ( 108 logger = loggo.GetLogger("juju.cmd.jujud") 109 retryDelay = 3 * time.Second 110 JujuRun = paths.MustSucceed(paths.JujuRun(series.HostSeries())) 111 112 // The following are defined as variables to allow the tests to 113 // intercept calls to the functions. 114 useMultipleCPUs = utils.UseMultipleCPUs 115 maybeInitiateMongoServer = peergrouper.MaybeInitiateMongoServer 116 ensureMongoAdminUser = mongo.EnsureAdminUser 117 newSingularRunner = singular.New 118 peergrouperNew = peergrouper.New 119 newMachiner = machiner.NewMachiner 120 newNetworker = networker.NewNetworker 121 newFirewaller = firewaller.NewFirewaller 122 newDiskManager = diskmanager.NewWorker 123 newStorageWorker = storageprovisioner.NewStorageProvisioner 124 newCertificateUpdater = certupdater.NewCertificateUpdater 125 newResumer = resumer.NewResumer 126 newInstancePoller = instancepoller.NewWorker 127 newCleaner = cleaner.NewCleaner 128 newAddresser = addresser.NewWorker 129 newMetadataUpdater = imagemetadataworker.NewWorker 130 reportOpenedState = func(io.Closer) {} 131 reportOpenedAPI = func(io.Closer) {} 132 getMetricAPI = metricAPI 133 ) 134 135 // Variable to override in tests, default is true 136 var ProductionMongoWriteConcern = true 137 138 func init() { 139 stateWorkerDialOpts = mongo.DefaultDialOpts() 140 stateWorkerDialOpts.PostDial = func(session *mgo.Session) error { 141 safe := mgo.Safe{} 142 if ProductionMongoWriteConcern { 143 safe.J = true 144 _, err := replicaset.CurrentConfig(session) 145 if err == nil { 146 // set mongo to write-majority (writes only returned after 147 // replicated to a majority of replica-set members). 148 safe.WMode = "majority" 149 } 150 } 151 session.SetSafe(&safe) 152 return nil 153 } 154 } 155 156 // AgentInitializer handles initializing a type for use as a Jujud 157 // agent. 158 type AgentInitializer interface { 159 AddFlags(*gnuflag.FlagSet) 160 CheckArgs([]string) error 161 } 162 163 // AgentConfigWriter encapsulates disk I/O operations with the agent 164 // config. 165 type AgentConfigWriter interface { 166 // ReadConfig reads the config for the given tag from disk. 167 ReadConfig(tag string) error 168 // ChangeConfig executes the given agent.ConfigMutator in a 169 // thread-safe context. 170 ChangeConfig(agent.ConfigMutator) error 171 // CurrentConfig returns a copy of the in-memory agent config. 172 CurrentConfig() agent.Config 173 } 174 175 // NewMachineAgentCmd creates a Command which handles parsing 176 // command-line arguments and instantiating and running a 177 // MachineAgent. 178 func NewMachineAgentCmd( 179 ctx *cmd.Context, 180 machineAgentFactory func(string) *MachineAgent, 181 agentInitializer AgentInitializer, 182 configFetcher AgentConfigWriter, 183 ) cmd.Command { 184 return &machineAgentCmd{ 185 ctx: ctx, 186 machineAgentFactory: machineAgentFactory, 187 agentInitializer: agentInitializer, 188 currentConfig: configFetcher, 189 } 190 } 191 192 type machineAgentCmd struct { 193 cmd.CommandBase 194 195 // This group of arguments is required. 196 agentInitializer AgentInitializer 197 currentConfig AgentConfigWriter 198 machineAgentFactory func(string) *MachineAgent 199 ctx *cmd.Context 200 201 // This group is for debugging purposes. 
202 logToStdErr bool 203 204 // The following are set via command-line flags. 205 machineId string 206 } 207 208 // Init is called by the cmd system to initialize the structure for 209 // running. 210 func (a *machineAgentCmd) Init(args []string) error { 211 212 if !names.IsValidMachine(a.machineId) { 213 return fmt.Errorf("--machine-id option must be set, and expects a non-negative integer") 214 } 215 if err := a.agentInitializer.CheckArgs(args); err != nil { 216 return err 217 } 218 219 // Due to changes in the logging, and needing to care about old 220 // environments that have been upgraded, we need to explicitly remove the 221 // file writer if one has been added, otherwise we will get duplicate 222 // lines of all logging in the log file. 223 loggo.RemoveWriter("logfile") 224 225 if a.logToStdErr { 226 return nil 227 } 228 229 err := a.currentConfig.ReadConfig(names.NewMachineTag(a.machineId).String()) 230 if err != nil { 231 return errors.Annotate(err, "cannot read agent configuration") 232 } 233 agentConfig := a.currentConfig.CurrentConfig() 234 235 // the context's stderr is set as the loggo writer in github.com/juju/cmd/logging.go 236 a.ctx.Stderr = &lumberjack.Logger{ 237 Filename: agent.LogFilename(agentConfig), 238 MaxSize: 300, // megabytes 239 MaxBackups: 2, 240 } 241 242 return nil 243 } 244 245 // Run instantiates a MachineAgent and runs it. 246 func (a *machineAgentCmd) Run(c *cmd.Context) error { 247 machineAgent := a.machineAgentFactory(a.machineId) 248 return machineAgent.Run(c) 249 } 250 251 // SetFlags adds the requisite flags to run this command. 252 func (a *machineAgentCmd) SetFlags(f *gnuflag.FlagSet) { 253 a.agentInitializer.AddFlags(f) 254 f.StringVar(&a.machineId, "machine-id", "", "id of the machine to run") 255 } 256 257 // Info returns usage information for the command. 258 func (a *machineAgentCmd) Info() *cmd.Info { 259 return &cmd.Info{ 260 Name: "machine", 261 Purpose: "run a juju machine agent", 262 } 263 } 264 265 // MachineAgentFactoryFn returns a function which instantiates a 266 // MachineAgent given a machineId. 267 func MachineAgentFactoryFn( 268 agentConfWriter AgentConfigWriter, 269 bufferedLogs logsender.LogRecordCh, 270 loopDeviceManager looputil.LoopDeviceManager, 271 ) func(string) *MachineAgent { 272 return func(machineId string) *MachineAgent { 273 return NewMachineAgent( 274 machineId, 275 agentConfWriter, 276 bufferedLogs, 277 NewUpgradeWorkerContext(), 278 worker.NewRunner(cmdutil.IsFatal, cmdutil.MoreImportant), 279 loopDeviceManager, 280 ) 281 } 282 } 283 284 // NewMachineAgent instantiates a new MachineAgent. 285 func NewMachineAgent( 286 machineId string, 287 agentConfWriter AgentConfigWriter, 288 bufferedLogs logsender.LogRecordCh, 289 upgradeWorkerContext *upgradeWorkerContext, 290 runner worker.Runner, 291 loopDeviceManager looputil.LoopDeviceManager, 292 ) *MachineAgent { 293 return &MachineAgent{ 294 machineId: machineId, 295 AgentConfigWriter: agentConfWriter, 296 bufferedLogs: bufferedLogs, 297 upgradeWorkerContext: upgradeWorkerContext, 298 workersStarted: make(chan struct{}), 299 runner: runner, 300 initialAgentUpgradeCheckComplete: make(chan struct{}), 301 loopDeviceManager: loopDeviceManager, 302 } 303 } 304 305 // MachineAgent is responsible for tying together all functionality 306 // needed to orchestrate a Jujud instance which controls a machine. 
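// It embeds an AgentConfigWriter for access to the agent's configuration
// and runs all of its long-lived workers under a single worker.Runner.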
type MachineAgent struct {
	AgentConfigWriter

	tomb                 tomb.Tomb
	machineId            string
	previousAgentVersion version.Number
	runner               worker.Runner
	bufferedLogs         logsender.LogRecordCh
	configChangedVal     voyeur.Value
	upgradeWorkerContext *upgradeWorkerContext
	workersStarted       chan struct{}

	// XXX(fwereade): these smell strongly of goroutine-unsafeness.
	restoreMode bool
	restoring   bool

	// Used to signal that the upgrade worker will not
	// reboot the agent on startup because there are no
	// longer any immediately pending agent upgrades.
	// Channel used as a selectable bool (closed means true).
	initialAgentUpgradeCheckComplete chan struct{}

	mongoInitMutex   sync.Mutex
	mongoInitialized bool

	loopDeviceManager looputil.LoopDeviceManager
}

// IsRestorePreparing reports whether we are in restore mode
// but not yet running the actual restore.
func (a *MachineAgent) IsRestorePreparing() bool {
	return a.restoreMode && !a.restoring
}

// IsRestoreRunning reports whether we are in restore mode
// and running the actual restore process.
func (a *MachineAgent) IsRestoreRunning() bool {
	return a.restoring
}

func (a *MachineAgent) isAgentUpgradePending() bool {
	select {
	case <-a.initialAgentUpgradeCheckComplete:
		return false
	default:
		return true
	}
}

// Wait waits for the machine agent to finish.
func (a *MachineAgent) Wait() error {
	return a.tomb.Wait()
}

// Stop stops the machine agent.
func (a *MachineAgent) Stop() error {
	a.runner.Kill()
	return a.tomb.Wait()
}

// Dying returns the channel that can be used to see if the machine
// agent is terminating.
func (a *MachineAgent) Dying() <-chan struct{} {
	return a.tomb.Dying()
}

// upgradeCertificateDNSNames ensures that the state server certificate
// recorded in the agent config, and also the mongo server.pem, contains the
// DNSNames entries required by Juju.
func (a *MachineAgent) upgradeCertificateDNSNames() error {
	agentConfig := a.CurrentConfig()
	si, ok := agentConfig.StateServingInfo()
	if !ok || si.CAPrivateKey == "" {
		// No certificate information exists yet, nothing to do.
		return nil
	}
	// Parse the current certificate to get the current dns names.
	serverCert, err := cert.ParseCert(si.Cert)
	if err != nil {
		return err
	}
	update := false
	dnsNames := set.NewStrings(serverCert.DNSNames...)
	requiredDNSNames := []string{"local", "juju-apiserver", "juju-mongodb"}
	for _, dnsName := range requiredDNSNames {
		if dnsNames.Contains(dnsName) {
			continue
		}
		dnsNames.Add(dnsName)
		update = true
	}
	if !update {
		return nil
	}
	// Write a new certificate to the mongo pem and agent config files.
	si.Cert, si.PrivateKey, err = cert.NewDefaultServer(agentConfig.CACert(), si.CAPrivateKey, dnsNames.Values())
	if err != nil {
		return err
	}
	if err := mongo.UpdateSSLKey(agentConfig.DataDir(), si.Cert, si.PrivateKey); err != nil {
		return err
	}
	return a.AgentConfigWriter.ChangeConfig(func(config agent.ConfigSetter) error {
		config.SetStateServingInfo(si)
		return nil
	})
}

// Run runs a machine agent.
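// In outline, it reads the agent configuration, upgrades the server
// certificate DNS names if required, starts the top-level workers
// ("api", "statestarter" and "termination"), and then blocks until the
// runner exits, translating termination, reboot and shutdown errors
// into the corresponding actions.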
416 func (a *MachineAgent) Run(*cmd.Context) error { 417 418 defer a.tomb.Done() 419 if err := a.ReadConfig(a.Tag().String()); err != nil { 420 return fmt.Errorf("cannot read agent configuration: %v", err) 421 } 422 423 logger.Infof("machine agent %v start (%s [%s])", a.Tag(), version.Current, runtime.Compiler) 424 if flags := featureflag.String(); flags != "" { 425 logger.Warningf("developer feature flags enabled: %s", flags) 426 } 427 428 // Before doing anything else, we need to make sure the certificate generated for 429 // use by mongo to validate state server connections is correct. This needs to be done 430 // before any possible restart of the mongo service. 431 // See bug http://pad.lv/1434680 432 if err := a.upgradeCertificateDNSNames(); err != nil { 433 return errors.Annotate(err, "error upgrading server certificate") 434 } 435 436 agentConfig := a.CurrentConfig() 437 438 if err := a.upgradeWorkerContext.InitializeUsingAgent(a); err != nil { 439 return errors.Annotate(err, "error during upgradeWorkerContext initialisation") 440 } 441 a.configChangedVal.Set(struct{}{}) 442 a.previousAgentVersion = agentConfig.UpgradedToVersion() 443 444 network.InitializeFromConfig(agentConfig) 445 charmrepo.CacheDir = filepath.Join(agentConfig.DataDir(), "charmcache") 446 if err := a.createJujuRun(agentConfig.DataDir()); err != nil { 447 return fmt.Errorf("cannot create juju run symlink: %v", err) 448 } 449 a.runner.StartWorker("api", a.APIWorker) 450 a.runner.StartWorker("statestarter", a.newStateStarterWorker) 451 a.runner.StartWorker("termination", func() (worker.Worker, error) { 452 return terminationworker.NewWorker(), nil 453 }) 454 455 // At this point, all workers will have been configured to start 456 close(a.workersStarted) 457 err := a.runner.Wait() 458 switch err { 459 case worker.ErrTerminateAgent: 460 err = a.uninstallAgent(agentConfig) 461 case worker.ErrRebootMachine: 462 logger.Infof("Caught reboot error") 463 err = a.executeRebootOrShutdown(params.ShouldReboot) 464 case worker.ErrShutdownMachine: 465 logger.Infof("Caught shutdown error") 466 err = a.executeRebootOrShutdown(params.ShouldShutdown) 467 } 468 err = cmdutil.AgentDone(logger, err) 469 a.tomb.Kill(err) 470 return err 471 } 472 473 func (a *MachineAgent) executeRebootOrShutdown(action params.RebootAction) error { 474 agentCfg := a.CurrentConfig() 475 // At this stage, all API connections would have been closed 476 // We need to reopen the API to clear the reboot flag after 477 // scheduling the reboot. It may be cleaner to do this in the reboot 478 // worker, before returning the ErrRebootMachine. 479 st, _, err := apicaller.OpenAPIState(a) 480 if err != nil { 481 logger.Infof("Reboot: Error connecting to state") 482 return errors.Trace(err) 483 } 484 // block until all units/containers are ready, and reboot/shutdown 485 finalize, err := reboot.NewRebootWaiter(st, agentCfg) 486 if err != nil { 487 return errors.Trace(err) 488 } 489 490 logger.Infof("Reboot: Executing reboot") 491 err = finalize.ExecuteReboot(action) 492 if err != nil { 493 logger.Infof("Reboot: Error executing reboot: %v", err) 494 return errors.Trace(err) 495 } 496 // On windows, the shutdown command is asynchronous. We return ErrRebootMachine 497 // so the agent will simply exit without error pending reboot/shutdown. 
	return worker.ErrRebootMachine
}

func (a *MachineAgent) ChangeConfig(mutate agent.ConfigMutator) error {
	err := a.AgentConfigWriter.ChangeConfig(mutate)
	a.configChangedVal.Set(struct{}{})
	if err != nil {
		return errors.Trace(err)
	}
	return nil
}

// PrepareRestore will flag the agent to allow only a limited set
// of commands defined in
// "github.com/juju/juju/apiserver".allowedMethodsAboutToRestore;
// the most noteworthy is Backups.Restore, which ensures that we can do
// all the file movements required for restore while no one else makes
// changes. It returns an error if the machine is already in this state.
func (a *MachineAgent) PrepareRestore() error {
	if a.restoreMode {
		return errors.Errorf("already in restore mode")
	}
	a.restoreMode = true
	return nil
}

// BeginRestore will flag the agent to disallow all commands, since
// restore should be running and therefore making changes that
// would override anything else done.
func (a *MachineAgent) BeginRestore() error {
	switch {
	case !a.restoreMode:
		return errors.Errorf("not in restore mode, cannot begin restoration")
	case a.restoring:
		return errors.Errorf("already restoring")
	}
	a.restoring = true
	return nil
}

// EndRestore will flag the agent to allow all commands again.
// Being invoked means that the restore process failed, since a
// successful restore restarts the agent.
func (a *MachineAgent) EndRestore() {
	a.restoreMode = false
	a.restoring = false
}

// newRestoreStateWatcherWorker returns a worker (or an error on failure)
// that watches the restoreInfo document and puts the agent into the
// different restore modes.
func (a *MachineAgent) newRestoreStateWatcherWorker(st *state.State) (worker.Worker, error) {
	rWorker := func(stopch <-chan struct{}) error {
		return a.restoreStateWatcher(st, stopch)
	}
	return worker.NewSimpleWorker(rWorker), nil
}

// restoreChanged is called whenever the restoreInfo document changes,
// signalling a new step in the restore process.
func (a *MachineAgent) restoreChanged(st *state.State) error {
	rinfo, err := st.RestoreInfoSetter()
	if err != nil {
		return errors.Annotate(err, "cannot read restore state")
	}
	switch rinfo.Status() {
	case state.RestorePending:
		a.PrepareRestore()
	case state.RestoreInProgress:
		a.BeginRestore()
	case state.RestoreFailed:
		a.EndRestore()
	}
	return nil
}

// restoreStateWatcher watches restoreInfo for changes in the restore process.
func (a *MachineAgent) restoreStateWatcher(st *state.State, stopch <-chan struct{}) error {
	restoreWatch := st.WatchRestoreInfoChanges()
	defer func() {
		restoreWatch.Kill()
		restoreWatch.Wait()
	}()

	for {
		select {
		case <-restoreWatch.Changes():
			if err := a.restoreChanged(st); err != nil {
				return err
			}
		case <-stopch:
			return nil
		}
	}
}

// newStateStarterWorker wraps stateStarter in a simple worker for use in
// a.runner.StartWorker.
func (a *MachineAgent) newStateStarterWorker() (worker.Worker, error) {
	return worker.NewSimpleWorker(a.stateStarter), nil
}

// stateStarter watches for changes to the agent configuration, and
// starts or stops the state worker as appropriate.
We watch the agent 603 // configuration because the agent configuration has all the details 604 // that we need to start a state server, whether they have been cached 605 // or read from the state. 606 // 607 // It will stop working as soon as stopch is closed. 608 func (a *MachineAgent) stateStarter(stopch <-chan struct{}) error { 609 confWatch := a.configChangedVal.Watch() 610 defer confWatch.Close() 611 watchCh := make(chan struct{}) 612 go func() { 613 for confWatch.Next() { 614 watchCh <- struct{}{} 615 } 616 }() 617 for { 618 select { 619 case <-watchCh: 620 agentConfig := a.CurrentConfig() 621 622 // N.B. StartWorker and StopWorker are idempotent. 623 _, ok := agentConfig.StateServingInfo() 624 if ok { 625 a.runner.StartWorker("state", func() (worker.Worker, error) { 626 return a.StateWorker() 627 }) 628 } else { 629 a.runner.StopWorker("state") 630 } 631 case <-stopch: 632 return nil 633 } 634 } 635 } 636 637 // APIWorker returns a Worker that connects to the API and starts any 638 // workers that need an API connection. 639 func (a *MachineAgent) APIWorker() (_ worker.Worker, err error) { 640 st, entity, err := apicaller.OpenAPIState(a) 641 if err != nil { 642 return nil, err 643 } 644 reportOpenedAPI(st) 645 646 defer func() { 647 // TODO(fwereade): this is not properly tested. Old tests were evil 648 // (dependent on injecting an error in a patched-out upgrader API 649 // that shouldn't even be used at this level)... so I just deleted 650 // them. Not a major worry: this whole method will become redundant 651 // when we switch to the dependency engine (and specifically use 652 // worker/apicaller to connect). 653 if err != nil { 654 if err := st.Close(); err != nil { 655 logger.Errorf("while closing API: %v", err) 656 } 657 } 658 }() 659 660 agentConfig := a.CurrentConfig() 661 for _, job := range entity.Jobs() { 662 if job.NeedsState() { 663 info, err := st.Agent().StateServingInfo() 664 if err != nil { 665 return nil, fmt.Errorf("cannot get state serving info: %v", err) 666 } 667 err = a.ChangeConfig(func(config agent.ConfigSetter) error { 668 config.SetStateServingInfo(info) 669 return nil 670 }) 671 if err != nil { 672 return nil, err 673 } 674 agentConfig = a.CurrentConfig() 675 break 676 } 677 } 678 679 runner := newConnRunner(st) 680 681 // Run the agent upgrader and the upgrade-steps worker without waiting for 682 // the upgrade steps to complete. 683 runner.StartWorker("upgrader", a.agentUpgraderWorkerStarter(st.Upgrader(), agentConfig)) 684 runner.StartWorker("upgrade-steps", a.upgradeStepsWorkerStarter(st, entity.Jobs())) 685 686 // All other workers must wait for the upgrade steps to complete before starting. 687 a.startWorkerAfterUpgrade(runner, "api-post-upgrade", func() (worker.Worker, error) { 688 return a.postUpgradeAPIWorker(st, agentConfig, entity) 689 }) 690 691 return cmdutil.NewCloseWorker(logger, runner, st), nil // Note: a worker.Runner is itself a worker.Worker. 692 } 693 694 func (a *MachineAgent) postUpgradeAPIWorker( 695 st api.Connection, 696 agentConfig agent.Config, 697 entity *apiagent.Entity, 698 ) (worker.Worker, error) { 699 700 var isEnvironManager bool 701 for _, job := range entity.Jobs() { 702 if job == multiwatcher.JobManageEnviron { 703 isEnvironManager = true 704 break 705 } 706 } 707 708 runner := newConnRunner(st) 709 710 // TODO(fwereade): this is *still* a hideous layering violation, but at least 711 // it's confined to jujud rather than extending into the worker itself. 
712 // Start this worker first to try and get proxy settings in place 713 // before we do anything else. 714 writeSystemFiles := shouldWriteProxyFiles(agentConfig) 715 runner.StartWorker("proxyupdater", func() (worker.Worker, error) { 716 return proxyupdater.New(st.Environment(), writeSystemFiles), nil 717 }) 718 719 if isEnvironManager { 720 runner.StartWorker("resumer", func() (worker.Worker, error) { 721 // The action of resumer is so subtle that it is not tested, 722 // because we can't figure out how to do so without 723 // brutalising the transaction log. 724 return newResumer(st.Resumer()), nil 725 }) 726 } 727 728 if feature.IsDbLogEnabled() { 729 runner.StartWorker("logsender", func() (worker.Worker, error) { 730 return logsender.New(a.bufferedLogs, gate.AlreadyUnlocked{}, a), nil 731 }) 732 } 733 734 envConfig, err := st.Environment().EnvironConfig() 735 if err != nil { 736 return nil, fmt.Errorf("cannot read environment config: %v", err) 737 } 738 ignoreMachineAddresses, _ := envConfig.IgnoreMachineAddresses() 739 if ignoreMachineAddresses { 740 logger.Infof("machine addresses not used, only addresses from provider") 741 } 742 runner.StartWorker("machiner", func() (worker.Worker, error) { 743 accessor := machiner.APIMachineAccessor{st.Machiner()} 744 return newMachiner(accessor, agentConfig, ignoreMachineAddresses), nil 745 }) 746 runner.StartWorker("reboot", func() (worker.Worker, error) { 747 reboot, err := st.Reboot() 748 if err != nil { 749 return nil, errors.Trace(err) 750 } 751 lock, err := cmdutil.HookExecutionLock(cmdutil.DataDir) 752 if err != nil { 753 return nil, errors.Trace(err) 754 } 755 return rebootworker.NewReboot(reboot, agentConfig, lock) 756 }) 757 runner.StartWorker("apiaddressupdater", func() (worker.Worker, error) { 758 addressUpdater := agent.APIHostPortsSetter{a} 759 return apiaddressupdater.NewAPIAddressUpdater(st.Machiner(), addressUpdater), nil 760 }) 761 runner.StartWorker("logger", func() (worker.Worker, error) { 762 return workerlogger.NewLogger(st.Logger(), agentConfig), nil 763 }) 764 765 if !featureflag.Enabled(feature.DisableRsyslog) { 766 rsyslogMode := rsyslog.RsyslogModeForwarding 767 if isEnvironManager { 768 rsyslogMode = rsyslog.RsyslogModeAccumulate 769 } 770 771 runner.StartWorker("rsyslog", func() (worker.Worker, error) { 772 return cmdutil.NewRsyslogConfigWorker(st.Rsyslog(), agentConfig, rsyslogMode) 773 }) 774 } 775 776 if !isEnvironManager { 777 runner.StartWorker("stateconverter", func() (worker.Worker, error) { 778 return worker.NewNotifyWorker(conv2state.New(st.Machiner(), a)), nil 779 }) 780 } 781 782 runner.StartWorker("diskmanager", func() (worker.Worker, error) { 783 api, err := st.DiskManager() 784 if err != nil { 785 return nil, errors.Trace(err) 786 } 787 return newDiskManager(diskmanager.DefaultListBlockDevices, api), nil 788 }) 789 runner.StartWorker("storageprovisioner-machine", func() (worker.Worker, error) { 790 scope := agentConfig.Tag() 791 api := st.StorageProvisioner(scope) 792 storageDir := filepath.Join(agentConfig.DataDir(), "storage") 793 return newStorageWorker( 794 scope, storageDir, api, api, api, api, api, api, 795 clock.WallClock, 796 ), nil 797 }) 798 799 if isEnvironManager { 800 // Start worker that stores missing published image metadata in state. 801 runner.StartWorker("imagemetadata", func() (worker.Worker, error) { 802 return newMetadataUpdater(st.MetadataUpdater()), nil 803 }) 804 } 805 806 // Check if the network management is disabled. 
807 disableNetworkManagement, _ := envConfig.DisableNetworkManagement() 808 if disableNetworkManagement { 809 logger.Infof("network management is disabled") 810 } 811 812 // Start networker depending on configuration and job. 813 intrusiveMode := false 814 for _, job := range entity.Jobs() { 815 if job == multiwatcher.JobManageNetworking { 816 intrusiveMode = true 817 break 818 } 819 } 820 intrusiveMode = intrusiveMode && !disableNetworkManagement 821 runner.StartWorker("networker", func() (worker.Worker, error) { 822 return newNetworker(st.Networker(), agentConfig, intrusiveMode, networker.DefaultConfigBaseDir) 823 }) 824 825 // If not a local provider bootstrap machine, start the worker to 826 // manage SSH keys. 827 providerType := agentConfig.Value(agent.ProviderType) 828 if providerType != provider.Local || a.machineId != bootstrapMachineId { 829 runner.StartWorker("authenticationworker", func() (worker.Worker, error) { 830 return authenticationworker.NewWorker(st.KeyUpdater(), agentConfig), nil 831 }) 832 } 833 834 // Perform the operations needed to set up hosting for containers. 835 if err := a.setupContainerSupport(runner, st, entity, agentConfig); err != nil { 836 cause := errors.Cause(err) 837 if params.IsCodeDead(cause) || cause == worker.ErrTerminateAgent { 838 return nil, worker.ErrTerminateAgent 839 } 840 return nil, fmt.Errorf("setting up container support: %v", err) 841 } 842 for _, job := range entity.Jobs() { 843 switch job { 844 case multiwatcher.JobHostUnits: 845 runner.StartWorker("deployer", func() (worker.Worker, error) { 846 apiDeployer := st.Deployer() 847 context := newDeployContext(apiDeployer, agentConfig) 848 return deployer.NewDeployer(apiDeployer, context), nil 849 }) 850 case multiwatcher.JobManageEnviron: 851 runner.StartWorker("identity-file-writer", func() (worker.Worker, error) { 852 inner := func(<-chan struct{}) error { 853 agentConfig := a.CurrentConfig() 854 return agent.WriteSystemIdentityFile(agentConfig) 855 } 856 return worker.NewSimpleWorker(inner), nil 857 }) 858 runner.StartWorker("toolsversionchecker", func() (worker.Worker, error) { 859 // 4 times a day seems a decent enough amount of checks. 860 checkerParams := toolsversionchecker.VersionCheckerParams{ 861 CheckInterval: time.Hour * 6, 862 } 863 return toolsversionchecker.New(st.Environment(), &checkerParams), nil 864 }) 865 866 case multiwatcher.JobManageStateDeprecated: 867 // Legacy environments may set this, but we ignore it. 868 default: 869 // TODO(dimitern): Once all workers moved over to using 870 // the API, report "unknown job type" here. 871 } 872 } 873 874 return cmdutil.NewCloseWorker(logger, runner, st), nil // Note: a worker.Runner is itself a worker.Worker. 875 } 876 877 // Restart restarts the agent's service. 
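// It looks up the agent's service name from the current config and asks
// the local service manager to restart that service.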
878 func (a *MachineAgent) Restart() error { 879 name := a.CurrentConfig().Value(agent.AgentServiceName) 880 return service.Restart(name) 881 } 882 883 func (a *MachineAgent) upgradeStepsWorkerStarter( 884 st api.Connection, 885 jobs []multiwatcher.MachineJob, 886 ) func() (worker.Worker, error) { 887 return func() (worker.Worker, error) { 888 return a.upgradeWorkerContext.Worker(a, st, jobs), nil 889 } 890 } 891 892 func (a *MachineAgent) agentUpgraderWorkerStarter( 893 st *apiupgrader.State, 894 agentConfig agent.Config, 895 ) func() (worker.Worker, error) { 896 return func() (worker.Worker, error) { 897 return upgrader.NewAgentUpgrader( 898 st, 899 agentConfig, 900 a.previousAgentVersion, 901 a.upgradeWorkerContext.IsUpgradeRunning, 902 a.initialAgentUpgradeCheckComplete, 903 ), nil 904 } 905 } 906 907 // shouldWriteProxyFiles returns true, unless the supplied conf identifies the 908 // machine agent running directly on the host system in a local environment. 909 var shouldWriteProxyFiles = func(conf agent.Config) bool { 910 if conf.Value(agent.ProviderType) != provider.Local { 911 return true 912 } 913 return conf.Tag() != names.NewMachineTag(bootstrapMachineId) 914 } 915 916 // setupContainerSupport determines what containers can be run on this machine and 917 // initialises suitable infrastructure to support such containers. 918 func (a *MachineAgent) setupContainerSupport(runner worker.Runner, st api.Connection, entity *apiagent.Entity, agentConfig agent.Config) error { 919 var supportedContainers []instance.ContainerType 920 // LXC containers are only supported on bare metal and fully virtualized linux systems 921 // Nested LXC containers and Windows machines cannot run LXC containers 922 supportsLXC, err := lxc.IsLXCSupported() 923 if err != nil { 924 logger.Warningf("no lxc containers possible: %v", err) 925 } 926 if err == nil && supportsLXC { 927 supportedContainers = append(supportedContainers, instance.LXC) 928 } 929 930 supportsKvm, err := kvm.IsKVMSupported() 931 if err != nil { 932 logger.Warningf("determining kvm support: %v\nno kvm containers possible", err) 933 } 934 if err == nil && supportsKvm { 935 supportedContainers = append(supportedContainers, instance.KVM) 936 } 937 return a.updateSupportedContainers(runner, st, entity.Tag(), supportedContainers, agentConfig) 938 } 939 940 // updateSupportedContainers records in state that a machine can run the specified containers. 941 // It starts a watcher and when a container of a given type is first added to the machine, 942 // the watcher is killed, the machine is set up to be able to start containers of the given type, 943 // and a suitable provisioner is started. 
944 func (a *MachineAgent) updateSupportedContainers( 945 runner worker.Runner, 946 st api.Connection, 947 machineTag string, 948 containers []instance.ContainerType, 949 agentConfig agent.Config, 950 ) error { 951 pr := st.Provisioner() 952 tag, err := names.ParseMachineTag(machineTag) 953 if err != nil { 954 return err 955 } 956 machine, err := pr.Machine(tag) 957 if errors.IsNotFound(err) || err == nil && machine.Life() == params.Dead { 958 return worker.ErrTerminateAgent 959 } 960 if err != nil { 961 return errors.Annotatef(err, "cannot load machine %s from state", tag) 962 } 963 if len(containers) == 0 { 964 if err := machine.SupportsNoContainers(); err != nil { 965 return errors.Annotatef(err, "clearing supported containers for %s", tag) 966 } 967 return nil 968 } 969 if err := machine.SetSupportedContainers(containers...); err != nil { 970 return errors.Annotatef(err, "setting supported containers for %s", tag) 971 } 972 initLock, err := cmdutil.HookExecutionLock(agentConfig.DataDir()) 973 if err != nil { 974 return err 975 } 976 // Start the watcher to fire when a container is first requested on the machine. 977 envUUID, err := st.EnvironTag() 978 if err != nil { 979 return err 980 } 981 watcherName := fmt.Sprintf("%s-container-watcher", machine.Id()) 982 // There may not be a CA certificate private key available, and without 983 // it we can't ensure that other Juju nodes can connect securely, so only 984 // use an image URL getter if there's a private key. 985 var imageURLGetter container.ImageURLGetter 986 if agentConfig.Value(agent.AllowsSecureConnection) == "true" { 987 imageURLGetter = container.NewImageURLGetter(st.Addr(), envUUID.Id(), []byte(agentConfig.CACert())) 988 } 989 params := provisioner.ContainerSetupParams{ 990 Runner: runner, 991 WorkerName: watcherName, 992 SupportedContainers: containers, 993 ImageURLGetter: imageURLGetter, 994 Machine: machine, 995 Provisioner: pr, 996 Config: agentConfig, 997 InitLock: initLock, 998 } 999 handler := provisioner.NewContainerSetupHandler(params) 1000 a.startWorkerAfterUpgrade(runner, watcherName, func() (worker.Worker, error) { 1001 return worker.NewStringsWorker(handler), nil 1002 }) 1003 return nil 1004 } 1005 1006 // StateWorker returns a worker running all the workers that require 1007 // a *state.State connection. 1008 func (a *MachineAgent) StateWorker() (worker.Worker, error) { 1009 agentConfig := a.CurrentConfig() 1010 1011 // Start MongoDB server and dial. 1012 if err := a.ensureMongoServer(agentConfig); err != nil { 1013 return nil, err 1014 } 1015 st, m, err := openState(agentConfig, stateWorkerDialOpts) 1016 if err != nil { 1017 return nil, err 1018 } 1019 reportOpenedState(st) 1020 1021 stor := statestorage.NewStorage(st.EnvironUUID(), st.MongoSession()) 1022 registerSimplestreamsDataSource(stor) 1023 1024 runner := newConnRunner(st) 1025 singularRunner, err := newSingularStateRunner(runner, st, m) 1026 if err != nil { 1027 return nil, errors.Trace(err) 1028 } 1029 1030 // Take advantage of special knowledge here in that we will only ever want 1031 // the storage provider on one machine, and that is the "bootstrap" node. 1032 providerType := agentConfig.Value(agent.ProviderType) 1033 if (providerType == provider.Local || provider.IsManual(providerType)) && m.Id() == bootstrapMachineId { 1034 a.startWorkerAfterUpgrade(runner, "local-storage", func() (worker.Worker, error) { 1035 // TODO(axw) 2013-09-24 bug #1229507 1036 // Make another job to enable storage. 1037 // There's nothing special about this. 
1038 return localstorage.NewWorker(agentConfig), nil 1039 }) 1040 } 1041 for _, job := range m.Jobs() { 1042 switch job { 1043 case state.JobHostUnits: 1044 // Implemented in APIWorker. 1045 case state.JobManageEnviron: 1046 useMultipleCPUs() 1047 a.startWorkerAfterUpgrade(runner, "env worker manager", func() (worker.Worker, error) { 1048 return envworkermanager.NewEnvWorkerManager(st, a.startEnvWorkers), nil 1049 }) 1050 a.startWorkerAfterUpgrade(runner, "peergrouper", func() (worker.Worker, error) { 1051 return peergrouperNew(st) 1052 }) 1053 a.startWorkerAfterUpgrade(runner, "restore", func() (worker.Worker, error) { 1054 return a.newRestoreStateWatcherWorker(st) 1055 }) 1056 1057 // certChangedChan is shared by multiple workers it's up 1058 // to the agent to close it rather than any one of the 1059 // workers. 1060 // 1061 // TODO(ericsnow) For now we simply do not close the channel. 1062 certChangedChan := make(chan params.StateServingInfo, 1) 1063 runner.StartWorker("apiserver", a.apiserverWorkerStarter(st, certChangedChan)) 1064 var stateServingSetter certupdater.StateServingInfoSetter = func(info params.StateServingInfo, done <-chan struct{}) error { 1065 return a.ChangeConfig(func(config agent.ConfigSetter) error { 1066 config.SetStateServingInfo(info) 1067 logger.Infof("update apiserver worker with new certificate") 1068 select { 1069 case certChangedChan <- info: 1070 return nil 1071 case <-done: 1072 return nil 1073 } 1074 }) 1075 } 1076 a.startWorkerAfterUpgrade(runner, "certupdater", func() (worker.Worker, error) { 1077 return newCertificateUpdater(m, agentConfig, st, st, stateServingSetter), nil 1078 }) 1079 1080 if feature.IsDbLogEnabled() { 1081 a.startWorkerAfterUpgrade(singularRunner, "dblogpruner", func() (worker.Worker, error) { 1082 return dblogpruner.New(st, dblogpruner.NewLogPruneParams()), nil 1083 }) 1084 } 1085 a.startWorkerAfterUpgrade(singularRunner, "statushistorypruner", func() (worker.Worker, error) { 1086 return statushistorypruner.New(st, statushistorypruner.NewHistoryPrunerParams()), nil 1087 }) 1088 1089 a.startWorkerAfterUpgrade(singularRunner, "txnpruner", func() (worker.Worker, error) { 1090 return txnpruner.New(st, time.Hour*2), nil 1091 }) 1092 1093 case state.JobManageStateDeprecated: 1094 // Legacy environments may set this, but we ignore it. 1095 default: 1096 logger.Warningf("ignoring unknown job %q", job) 1097 } 1098 } 1099 return cmdutil.NewCloseWorker(logger, runner, stateWorkerCloser{st}), nil 1100 } 1101 1102 type stateWorkerCloser struct { 1103 stateCloser io.Closer 1104 } 1105 1106 func (s stateWorkerCloser) Close() error { 1107 // This state-dependent data source will be useless once state is closed - 1108 // un-register it before closing state. 1109 unregisterSimplestreamsDataSource() 1110 return s.stateCloser.Close() 1111 } 1112 1113 // startEnvWorkers starts state server workers that need to run per 1114 // environment. 1115 func (a *MachineAgent) startEnvWorkers( 1116 ssSt envworkermanager.InitialState, 1117 st *state.State, 1118 ) (_ worker.Worker, err error) { 1119 envUUID := st.EnvironUUID() 1120 defer errors.DeferredAnnotatef(&err, "failed to start workers for env %s", envUUID) 1121 logger.Infof("starting workers for env %s", envUUID) 1122 1123 // Establish API connection for this environment. 
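// The API info from the agent config is reused, but pointed at this
// environment by setting its EnvironTag before dialling.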
1124 agentConfig := a.CurrentConfig() 1125 apiInfo := agentConfig.APIInfo() 1126 apiInfo.EnvironTag = st.EnvironTag() 1127 apiSt, err := apicaller.OpenAPIStateUsingInfo(apiInfo, agentConfig.OldPassword()) 1128 if err != nil { 1129 return nil, errors.Trace(err) 1130 } 1131 1132 // Create a runner for workers specific to this 1133 // environment. Either the State or API connection failing will be 1134 // considered fatal, killing the runner and all its workers. 1135 runner := newConnRunner(st, apiSt) 1136 defer func() { 1137 if err != nil && runner != nil { 1138 runner.Kill() 1139 runner.Wait() 1140 } 1141 }() 1142 // Close the API connection when the runner for this environment dies. 1143 go func() { 1144 runner.Wait() 1145 err := apiSt.Close() 1146 if err != nil { 1147 logger.Errorf("failed to close API connection for env %s: %v", envUUID, err) 1148 } 1149 }() 1150 1151 // Create a singular runner for this environment. 1152 machine, err := ssSt.Machine(a.machineId) 1153 if err != nil { 1154 return nil, errors.Trace(err) 1155 } 1156 singularRunner, err := newSingularStateRunner(runner, ssSt, machine) 1157 if err != nil { 1158 return nil, errors.Trace(err) 1159 } 1160 defer func() { 1161 if err != nil && singularRunner != nil { 1162 singularRunner.Kill() 1163 singularRunner.Wait() 1164 } 1165 }() 1166 1167 // Start workers that depend on a *state.State. 1168 // TODO(fwereade): 2015-04-21 THIS SHALL NOT PASS 1169 // Seriously, these should all be using the API. 1170 singularRunner.StartWorker("minunitsworker", func() (worker.Worker, error) { 1171 return minunitsworker.NewMinUnitsWorker(st), nil 1172 }) 1173 1174 // Start workers that use an API connection. 1175 singularRunner.StartWorker("environ-provisioner", func() (worker.Worker, error) { 1176 return provisioner.NewEnvironProvisioner(apiSt.Provisioner(), agentConfig), nil 1177 }) 1178 singularRunner.StartWorker("environ-storageprovisioner", func() (worker.Worker, error) { 1179 scope := st.EnvironTag() 1180 api := apiSt.StorageProvisioner(scope) 1181 return newStorageWorker( 1182 scope, "", api, api, api, api, api, api, 1183 clock.WallClock, 1184 ), nil 1185 }) 1186 singularRunner.StartWorker("charm-revision-updater", func() (worker.Worker, error) { 1187 return charmrevisionworker.NewRevisionUpdateWorker(apiSt.CharmRevisionUpdater()), nil 1188 }) 1189 runner.StartWorker("metricmanagerworker", func() (worker.Worker, error) { 1190 return metricworker.NewMetricsManager(getMetricAPI(apiSt)) 1191 }) 1192 singularRunner.StartWorker("instancepoller", func() (worker.Worker, error) { 1193 return newInstancePoller(apiSt.InstancePoller()), nil 1194 }) 1195 singularRunner.StartWorker("cleaner", func() (worker.Worker, error) { 1196 return newCleaner(apiSt.Cleaner()), nil 1197 }) 1198 singularRunner.StartWorker("addresserworker", func() (worker.Worker, error) { 1199 return newAddresser(apiSt.Addresser()) 1200 }) 1201 1202 // TODO(axw) 2013-09-24 bug #1229506 1203 // Make another job to enable the firewaller. Not all 1204 // environments are capable of managing ports 1205 // centrally. 
1206 fwMode, err := getFirewallMode(apiSt) 1207 if err != nil { 1208 return nil, errors.Annotate(err, "cannot get firewall mode") 1209 } 1210 if fwMode != config.FwNone { 1211 singularRunner.StartWorker("firewaller", func() (worker.Worker, error) { 1212 return newFirewaller(apiSt.Firewaller()) 1213 }) 1214 } else { 1215 logger.Debugf("not starting firewaller worker - firewall-mode is %q", fwMode) 1216 } 1217 1218 return runner, nil 1219 } 1220 1221 var getFirewallMode = _getFirewallMode 1222 1223 func _getFirewallMode(apiSt api.Connection) (string, error) { 1224 envConfig, err := apiSt.Environment().EnvironConfig() 1225 if err != nil { 1226 return "", errors.Annotate(err, "cannot read environment config") 1227 } 1228 return envConfig.FirewallMode(), nil 1229 } 1230 1231 // stateWorkerDialOpts is a mongo.DialOpts suitable 1232 // for use by StateWorker to dial mongo. 1233 // 1234 // This must be overridden in tests, as it assumes 1235 // journaling is enabled. 1236 var stateWorkerDialOpts mongo.DialOpts 1237 1238 func (a *MachineAgent) apiserverWorkerStarter(st *state.State, certChanged chan params.StateServingInfo) func() (worker.Worker, error) { 1239 return func() (worker.Worker, error) { return a.newApiserverWorker(st, certChanged) } 1240 } 1241 1242 func (a *MachineAgent) newApiserverWorker(st *state.State, certChanged chan params.StateServingInfo) (worker.Worker, error) { 1243 agentConfig := a.CurrentConfig() 1244 // If the configuration does not have the required information, 1245 // it is currently not a recoverable error, so we kill the whole 1246 // agent, potentially enabling human intervention to fix 1247 // the agent's configuration file. 1248 info, ok := agentConfig.StateServingInfo() 1249 if !ok { 1250 return nil, &cmdutil.FatalError{"StateServingInfo not available and we need it"} 1251 } 1252 cert := []byte(info.Cert) 1253 key := []byte(info.PrivateKey) 1254 1255 if len(cert) == 0 || len(key) == 0 { 1256 return nil, &cmdutil.FatalError{"configuration does not have state server cert/key"} 1257 } 1258 tag := agentConfig.Tag() 1259 dataDir := agentConfig.DataDir() 1260 logDir := agentConfig.LogDir() 1261 1262 endpoint := net.JoinHostPort("", strconv.Itoa(info.APIPort)) 1263 listener, err := net.Listen("tcp", endpoint) 1264 if err != nil { 1265 return nil, err 1266 } 1267 return apiserver.NewServer(st, listener, apiserver.ServerConfig{ 1268 Cert: cert, 1269 Key: key, 1270 Tag: tag, 1271 DataDir: dataDir, 1272 LogDir: logDir, 1273 Validator: a.limitLogins, 1274 CertChanged: certChanged, 1275 }) 1276 } 1277 1278 // limitLogins is called by the API server for each login attempt. 1279 // it returns an error if upgrades or restore are running. 1280 func (a *MachineAgent) limitLogins(req params.LoginRequest) error { 1281 if err := a.limitLoginsDuringRestore(req); err != nil { 1282 return err 1283 } 1284 return a.limitLoginsDuringUpgrade(req) 1285 } 1286 1287 // limitLoginsDuringRestore will only allow logins for restore related purposes 1288 // while the different steps of restore are running. 
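// Logins from this machine's own agent are still allowed so that the
// restore machinery itself can keep operating.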
1289 func (a *MachineAgent) limitLoginsDuringRestore(req params.LoginRequest) error { 1290 var err error 1291 switch { 1292 case a.IsRestoreRunning(): 1293 err = apiserver.RestoreInProgressError 1294 case a.IsRestorePreparing(): 1295 err = apiserver.AboutToRestoreError 1296 } 1297 if err != nil { 1298 authTag, parseErr := names.ParseTag(req.AuthTag) 1299 if parseErr != nil { 1300 return errors.Annotate(err, "could not parse auth tag") 1301 } 1302 switch authTag := authTag.(type) { 1303 case names.UserTag: 1304 // use a restricted API mode 1305 return err 1306 case names.MachineTag: 1307 if authTag == a.Tag() { 1308 // allow logins from the local machine 1309 return nil 1310 } 1311 } 1312 return errors.Errorf("login for %q blocked because restore is in progress", authTag) 1313 } 1314 return nil 1315 } 1316 1317 // limitLoginsDuringUpgrade is called by the API server for each login 1318 // attempt. It returns an error if upgrades are in progress unless the 1319 // login is for a user (i.e. a client) or the local machine. 1320 func (a *MachineAgent) limitLoginsDuringUpgrade(req params.LoginRequest) error { 1321 if a.upgradeWorkerContext.IsUpgradeRunning() || a.isAgentUpgradePending() { 1322 authTag, err := names.ParseTag(req.AuthTag) 1323 if err != nil { 1324 return errors.Annotate(err, "could not parse auth tag") 1325 } 1326 switch authTag := authTag.(type) { 1327 case names.UserTag: 1328 // use a restricted API mode 1329 return apiserver.UpgradeInProgressError 1330 case names.MachineTag: 1331 if authTag == a.Tag() { 1332 // allow logins from the local machine 1333 return nil 1334 } 1335 } 1336 return errors.Errorf("login for %q blocked because %s", authTag, apiserver.UpgradeInProgressError.Error()) 1337 } else { 1338 return nil // allow all logins 1339 } 1340 } 1341 1342 var stateWorkerServingConfigErr = errors.New("state worker started with no state serving info") 1343 1344 // ensureMongoServer ensures that mongo is installed and running, 1345 // and ready for opening a state connection. 1346 func (a *MachineAgent) ensureMongoServer(agentConfig agent.Config) (err error) { 1347 a.mongoInitMutex.Lock() 1348 defer a.mongoInitMutex.Unlock() 1349 if a.mongoInitialized { 1350 logger.Debugf("mongo is already initialized") 1351 return nil 1352 } 1353 defer func() { 1354 if err == nil { 1355 a.mongoInitialized = true 1356 } 1357 }() 1358 1359 // Many of the steps here, such as adding the state server to the 1360 // admin DB and initiating the replicaset, are once-only actions, 1361 // required when upgrading from a pre-HA-capable 1362 // environment. These calls won't do anything if the thing they 1363 // need to set up has already been done. 
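// The flow below is: if mongo is already installed, ensure the admin
// user and shared secret exist and work out whether the replicaset still
// needs initiating (capturing the machine addresses first); then install
// or refresh the mongo service; finally initiate the replicaset if
// required.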
1364 var needReplicasetInit = false 1365 var machineAddrs []network.Address 1366 1367 mongoInstalled, err := mongo.IsServiceInstalled(agentConfig.Value(agent.Namespace)) 1368 if err != nil { 1369 return errors.Annotate(err, "error while checking if mongodb service is installed") 1370 } 1371 1372 if mongoInstalled { 1373 logger.Debugf("mongodb service is installed") 1374 1375 if _, err := a.ensureMongoAdminUser(agentConfig); err != nil { 1376 return errors.Trace(err) 1377 } 1378 1379 if err := a.ensureMongoSharedSecret(agentConfig); err != nil { 1380 return errors.Trace(err) 1381 } 1382 agentConfig = a.CurrentConfig() // ensureMongoSharedSecret may have updated the config 1383 1384 mongoInfo, ok := agentConfig.MongoInfo() 1385 if !ok { 1386 return errors.New("unable to retrieve mongo info to check replicaset") 1387 } 1388 1389 needReplicasetInit, err = isReplicasetInitNeeded(mongoInfo) 1390 if err != nil { 1391 return errors.Annotate(err, "error while checking replicaset") 1392 } 1393 1394 // If the replicaset is to be initialised the machine addresses 1395 // need to be retrieved *before* MongoDB is restarted with the 1396 // --replset option (in EnsureMongoServer). Once MongoDB is 1397 // started with --replset it won't respond to queries until the 1398 // replicaset is initiated. 1399 if needReplicasetInit { 1400 logger.Infof("replicaset not yet configured") 1401 machineAddrs, err = getMachineAddresses(agentConfig) 1402 if err != nil { 1403 return errors.Trace(err) 1404 } 1405 } 1406 } 1407 1408 // EnsureMongoServer installs/upgrades the init config as necessary. 1409 ensureServerParams, err := cmdutil.NewEnsureServerParams(agentConfig) 1410 if err != nil { 1411 return err 1412 } 1413 if err := cmdutil.EnsureMongoServer(ensureServerParams); err != nil { 1414 return err 1415 } 1416 1417 // Initiate the replicaset if required. 1418 if needReplicasetInit { 1419 servingInfo, ok := agentConfig.StateServingInfo() 1420 if !ok { 1421 return stateWorkerServingConfigErr 1422 } 1423 mongoInfo, ok := agentConfig.MongoInfo() 1424 if !ok { 1425 return errors.New("unable to retrieve mongo info to initiate replicaset") 1426 } 1427 if err := initiateReplicaSet(mongoInfo, servingInfo.StatePort, machineAddrs); err != nil { 1428 return err 1429 } 1430 } 1431 1432 return nil 1433 } 1434 1435 // ensureMongoAdminUser ensures that the machine's mongo user is in 1436 // the admin DB. 1437 func (a *MachineAgent) ensureMongoAdminUser(agentConfig agent.Config) (added bool, err error) { 1438 mongoInfo, ok1 := agentConfig.MongoInfo() 1439 servingInfo, ok2 := agentConfig.StateServingInfo() 1440 if !ok1 || !ok2 { 1441 return false, stateWorkerServingConfigErr 1442 } 1443 dialInfo, err := mongo.DialInfo(mongoInfo.Info, mongo.DefaultDialOpts()) 1444 if err != nil { 1445 return false, err 1446 } 1447 if len(dialInfo.Addrs) > 1 { 1448 logger.Infof("more than one state server; admin user must exist") 1449 return false, nil 1450 } 1451 return ensureMongoAdminUser(mongo.EnsureAdminUserParams{ 1452 DialInfo: dialInfo, 1453 Namespace: agentConfig.Value(agent.Namespace), 1454 DataDir: agentConfig.DataDir(), 1455 Port: servingInfo.StatePort, 1456 User: mongoInfo.Tag.String(), 1457 Password: mongoInfo.Password, 1458 }) 1459 } 1460 1461 // ensureMongoSharedSecret generates a MongoDB shared secret if 1462 // required, updating the agent's config and state. 
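// The generated secret is written to the agent configuration first and
// then persisted in state over a direct (non-replicaset) mongo
// connection, since the replicaset may not have been initiated yet.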
1463 func (a *MachineAgent) ensureMongoSharedSecret(agentConfig agent.Config) error { 1464 servingInfo, ok := agentConfig.StateServingInfo() 1465 if !ok { 1466 return stateWorkerServingConfigErr 1467 } 1468 1469 if servingInfo.SharedSecret != "" { 1470 return nil // Already done 1471 } 1472 1473 logger.Infof("state serving info has no shared secret - generating") 1474 1475 var err error 1476 servingInfo.SharedSecret, err = mongo.GenerateSharedSecret() 1477 if err != nil { 1478 return err 1479 } 1480 logger.Debugf("updating state serving info in agent config") 1481 if err = a.ChangeConfig(func(config agent.ConfigSetter) error { 1482 config.SetStateServingInfo(servingInfo) 1483 return nil 1484 }); err != nil { 1485 return err 1486 } 1487 agentConfig = a.CurrentConfig() 1488 1489 logger.Debugf("updating state serving info in state") 1490 1491 // Note: we set Direct=true in the mongo options because it's 1492 // possible that we've previously upgraded the mongo server's 1493 // configuration to form a replicaset, but failed to initiate it. 1494 dialOpts := mongo.DefaultDialOpts() 1495 dialOpts.Direct = true 1496 st, _, err := openState(agentConfig, dialOpts) 1497 if err != nil { 1498 return err 1499 } 1500 defer st.Close() 1501 1502 ssi := cmdutil.ParamsStateServingInfoToStateStateServingInfo(servingInfo) 1503 if err := st.SetStateServingInfo(ssi); err != nil { 1504 return errors.Errorf("cannot set state serving info: %v", err) 1505 } 1506 1507 logger.Infof("shared secret updated in state serving info") 1508 return nil 1509 } 1510 1511 // isReplicasetInitNeeded returns true if the replicaset needs to be 1512 // initiated. 1513 func isReplicasetInitNeeded(mongoInfo *mongo.MongoInfo) (bool, error) { 1514 dialInfo, err := mongo.DialInfo(mongoInfo.Info, mongo.DefaultDialOpts()) 1515 if err != nil { 1516 return false, errors.Annotate(err, "cannot generate dial info to check replicaset") 1517 } 1518 dialInfo.Username = mongoInfo.Tag.String() 1519 dialInfo.Password = mongoInfo.Password 1520 1521 session, err := mgo.DialWithInfo(dialInfo) 1522 if err != nil { 1523 return false, errors.Annotate(err, "cannot dial mongo to check replicaset") 1524 } 1525 defer session.Close() 1526 1527 cfg, err := replicaset.CurrentConfig(session) 1528 if err != nil { 1529 logger.Debugf("couldn't retrieve replicaset config (not fatal): %v", err) 1530 return true, nil 1531 } 1532 numMembers := len(cfg.Members) 1533 logger.Debugf("replicaset member count: %d", numMembers) 1534 return numMembers < 1, nil 1535 } 1536 1537 // getMachineAddresses connects to state to determine the machine's 1538 // network addresses. 1539 func getMachineAddresses(agentConfig agent.Config) ([]network.Address, error) { 1540 logger.Debugf("opening state to get machine addresses") 1541 dialOpts := mongo.DefaultDialOpts() 1542 dialOpts.Direct = true 1543 st, m, err := openState(agentConfig, dialOpts) 1544 if err != nil { 1545 return nil, errors.Annotate(err, "failed to open state to retrieve machine addresses") 1546 } 1547 defer st.Close() 1548 return m.Addresses(), nil 1549 } 1550 1551 // initiateReplicaSet connects to MongoDB and sets up the replicaset. 
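// It selects a suitable peer address from the machine's addresses and
// tolerates peergrouper.ErrReplicaSetAlreadyInitiated, so calling it when
// the replicaset already exists is harmless.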
1552 func initiateReplicaSet(mongoInfo *mongo.MongoInfo, statePort int, machineAddrs []network.Address) error { 1553 peerAddr := mongo.SelectPeerAddress(machineAddrs) 1554 if peerAddr == "" { 1555 return errors.Errorf("no appropriate peer address found in %q", machineAddrs) 1556 } 1557 1558 dialInfo, err := mongo.DialInfo(mongoInfo.Info, mongo.DefaultDialOpts()) 1559 if err != nil { 1560 return errors.Annotate(err, "cannot generate dial info to initiate replicaset") 1561 } 1562 1563 if err := maybeInitiateMongoServer(peergrouper.InitiateMongoParams{ 1564 DialInfo: dialInfo, 1565 MemberHostPort: net.JoinHostPort(peerAddr, fmt.Sprint(statePort)), 1566 User: mongoInfo.Tag.String(), // TODO(dfc) InitiateMongoParams should take a Tag 1567 Password: mongoInfo.Password, 1568 }); err != nil && err != peergrouper.ErrReplicaSetAlreadyInitiated { 1569 return err 1570 } 1571 return nil 1572 } 1573 1574 func openState(agentConfig agent.Config, dialOpts mongo.DialOpts) (_ *state.State, _ *state.Machine, err error) { 1575 info, ok := agentConfig.MongoInfo() 1576 if !ok { 1577 return nil, nil, fmt.Errorf("no state info available") 1578 } 1579 st, err := state.Open(agentConfig.Environment(), info, dialOpts, environs.NewStatePolicy()) 1580 if err != nil { 1581 return nil, nil, err 1582 } 1583 defer func() { 1584 if err != nil { 1585 st.Close() 1586 } 1587 }() 1588 m0, err := st.FindEntity(agentConfig.Tag()) 1589 if err != nil { 1590 if errors.IsNotFound(err) { 1591 err = worker.ErrTerminateAgent 1592 } 1593 return nil, nil, err 1594 } 1595 m := m0.(*state.Machine) 1596 if m.Life() == state.Dead { 1597 return nil, nil, worker.ErrTerminateAgent 1598 } 1599 // Check the machine nonce as provisioned matches the agent.Conf value. 1600 if !m.CheckProvisioned(agentConfig.Nonce()) { 1601 // The agent is running on a different machine to the one it 1602 // should be according to state. It must stop immediately. 1603 logger.Errorf("running machine %v agent on inappropriate instance", m) 1604 return nil, nil, worker.ErrTerminateAgent 1605 } 1606 return st, m, nil 1607 } 1608 1609 // startWorkerAfterUpgrade starts a worker to run the specified child worker 1610 // but only after waiting for upgrades to complete. 1611 func (a *MachineAgent) startWorkerAfterUpgrade(runner worker.Runner, name string, start func() (worker.Worker, error)) { 1612 runner.StartWorker(name, func() (worker.Worker, error) { 1613 return a.upgradeWaiterWorker(name, start), nil 1614 }) 1615 } 1616 1617 // upgradeWaiterWorker runs the specified worker after upgrades have completed. 1618 func (a *MachineAgent) upgradeWaiterWorker(name string, start func() (worker.Worker, error)) worker.Worker { 1619 return worker.NewSimpleWorker(func(stop <-chan struct{}) error { 1620 // Wait for the agent upgrade and upgrade steps to complete (or for us to be stopped). 1621 for _, ch := range []chan struct{}{ 1622 a.upgradeWorkerContext.UpgradeComplete, 1623 a.initialAgentUpgradeCheckComplete, 1624 } { 1625 select { 1626 case <-stop: 1627 return nil 1628 case <-ch: 1629 } 1630 } 1631 logger.Debugf("upgrades done, starting worker %q", name) 1632 1633 // Upgrades are done, start the worker. 1634 worker, err := start() 1635 if err != nil { 1636 return err 1637 } 1638 // Wait for worker to finish or for us to be stopped. 
1639 waitCh := make(chan error) 1640 go func() { 1641 waitCh <- worker.Wait() 1642 }() 1643 select { 1644 case err := <-waitCh: 1645 logger.Debugf("worker %q exited with %v", name, err) 1646 return err 1647 case <-stop: 1648 logger.Debugf("stopping so killing worker %q", name) 1649 worker.Kill() 1650 } 1651 return <-waitCh // Ensure worker has stopped before returning. 1652 }) 1653 } 1654 1655 func (a *MachineAgent) setMachineStatus(apiState api.Connection, status params.Status, info string) error { 1656 tag := a.Tag().(names.MachineTag) 1657 machine, err := apiState.Machiner().Machine(tag) 1658 if err != nil { 1659 return errors.Trace(err) 1660 } 1661 if err := machine.SetStatus(status, info, nil); err != nil { 1662 return errors.Trace(err) 1663 } 1664 return nil 1665 } 1666 1667 // WorkersStarted returns a channel that's closed once all top level workers 1668 // have been started. This is provided for testing purposes. 1669 func (a *MachineAgent) WorkersStarted() <-chan struct{} { 1670 return a.workersStarted 1671 } 1672 1673 func (a *MachineAgent) Tag() names.Tag { 1674 return names.NewMachineTag(a.machineId) 1675 } 1676 1677 func (a *MachineAgent) createJujuRun(dataDir string) error { 1678 // TODO do not remove the symlink if it already points 1679 // to the right place. 1680 if err := os.Remove(JujuRun); err != nil && !os.IsNotExist(err) { 1681 return err 1682 } 1683 jujud := filepath.Join(dataDir, "tools", a.Tag().String(), jujunames.Jujud) 1684 return symlink.New(jujud, JujuRun) 1685 } 1686 1687 func (a *MachineAgent) uninstallAgent(agentConfig agent.Config) error { 1688 var errors []error 1689 agentServiceName := agentConfig.Value(agent.AgentServiceName) 1690 if agentServiceName == "" { 1691 // For backwards compatibility, handle lack of AgentServiceName. 1692 agentServiceName = os.Getenv("UPSTART_JOB") 1693 } 1694 if agentServiceName != "" { 1695 svc, err := service.DiscoverService(agentServiceName, common.Conf{}) 1696 if err != nil { 1697 errors = append(errors, fmt.Errorf("cannot remove service %q: %v", agentServiceName, err)) 1698 } else if err := svc.Remove(); err != nil { 1699 errors = append(errors, fmt.Errorf("cannot remove service %q: %v", agentServiceName, err)) 1700 } 1701 } 1702 1703 // Remove the juju-run symlink. 1704 if err := os.Remove(JujuRun); err != nil && !os.IsNotExist(err) { 1705 errors = append(errors, err) 1706 } 1707 1708 insideLXC, err := lxcutils.RunningInsideLXC() 1709 if err != nil { 1710 errors = append(errors, err) 1711 } else if insideLXC { 1712 // We're running inside LXC, so loop devices may leak. Detach 1713 // any loop devices that are backed by files on this machine. 1714 // 1715 // It is necessary to do this here as well as in container/lxc, 1716 // as container/lxc needs to check in the container's rootfs 1717 // to see if the loop device is attached to the container; that 1718 // will fail if the data-dir is removed first. 
1719 if err := a.loopDeviceManager.DetachLoopDevices("/", agentConfig.DataDir()); err != nil { 1720 errors = append(errors, err) 1721 } 1722 } 1723 1724 namespace := agentConfig.Value(agent.Namespace) 1725 if err := mongo.RemoveService(namespace); err != nil { 1726 errors = append(errors, fmt.Errorf("cannot stop/remove mongo service with namespace %q: %v", namespace, err)) 1727 } 1728 if err := os.RemoveAll(agentConfig.DataDir()); err != nil { 1729 errors = append(errors, err) 1730 } 1731 if len(errors) == 0 { 1732 return nil 1733 } 1734 return fmt.Errorf("uninstall failed: %v", errors) 1735 } 1736 1737 func newConnRunner(conns ...cmdutil.Pinger) worker.Runner { 1738 return worker.NewRunner(cmdutil.ConnectionIsFatal(logger, conns...), cmdutil.MoreImportant) 1739 } 1740 1741 type MongoSessioner interface { 1742 MongoSession() *mgo.Session 1743 } 1744 1745 func newSingularStateRunner(runner worker.Runner, st MongoSessioner, m *state.Machine) (worker.Runner, error) { 1746 singularStateConn := singularStateConn{st.MongoSession(), m} 1747 singularRunner, err := newSingularRunner(runner, singularStateConn) 1748 if err != nil { 1749 return nil, errors.Annotate(err, "cannot make singular State Runner") 1750 } 1751 return singularRunner, err 1752 } 1753 1754 // singularStateConn implements singular.Conn on 1755 // top of a State connection. 1756 type singularStateConn struct { 1757 session *mgo.Session 1758 machine *state.Machine 1759 } 1760 1761 func (c singularStateConn) IsMaster() (bool, error) { 1762 return mongo.IsMaster(c.session, c.machine) 1763 } 1764 1765 func (c singularStateConn) Ping() error { 1766 return c.session.Ping() 1767 } 1768 1769 func metricAPI(st api.Connection) metricsmanager.MetricsManagerClient { 1770 return metricsmanager.NewClient(st) 1771 } 1772 1773 // newDeployContext gives the tests the opportunity to create a deployer.Context 1774 // that can be used for testing so as to avoid (1) deploying units to the system 1775 // running the tests and (2) get access to the *State used internally, so that 1776 // tests can be run without waiting for the 5s watcher refresh time to which we would 1777 // otherwise be restricted. 1778 var newDeployContext = func(st *apideployer.State, agentConfig agent.Config) deployer.Context { 1779 return deployer.NewSimpleContext(agentConfig, st) 1780 }
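
// For orientation, a minimal sketch of how the constructors above are
// typically wired together by the jujud entry point. The identifiers
// ctx, agentConf, bufferedLogs and loopDeviceManager stand for values
// the caller already holds (agentConf is assumed to satisfy both
// AgentInitializer and AgentConfigWriter); this is illustrative only,
// not a verbatim copy of the jujud main package:
//
//	factory := MachineAgentFactoryFn(agentConf, bufferedLogs, loopDeviceManager)
//	machineCmd := NewMachineAgentCmd(ctx, factory, agentConf, agentConf)
//	// machineCmd can then be registered with the jujud super-command;
//	// when run with --machine-id it builds a MachineAgent and calls Run.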