github.com/cloudbase/juju-core@v0.0.0-20140504232958-a7271ac7912f/cmd/jujud/machine.go (about) 1 // Copyright 2012, 2013 Canonical Ltd. 2 // Licensed under the AGPLv3, see LICENCE file for details. 3 4 package main 5 6 import ( 7 "fmt" 8 "os" 9 "path/filepath" 10 "time" 11 12 "github.com/juju/loggo" 13 "launchpad.net/gnuflag" 14 "launchpad.net/tomb" 15 16 "launchpad.net/juju-core/agent" 17 "launchpad.net/juju-core/charm" 18 "launchpad.net/juju-core/juju/osenv" 19 "launchpad.net/juju-core/cmd" 20 "launchpad.net/juju-core/container/kvm" 21 "launchpad.net/juju-core/instance" 22 "launchpad.net/juju-core/names" 23 "launchpad.net/juju-core/provider" 24 "launchpad.net/juju-core/state" 25 "launchpad.net/juju-core/state/api" 26 apiagent "launchpad.net/juju-core/state/api/agent" 27 "launchpad.net/juju-core/state/api/params" 28 apiprovisioner "launchpad.net/juju-core/state/api/provisioner" 29 "launchpad.net/juju-core/state/apiserver" 30 "launchpad.net/juju-core/upgrades" 31 "launchpad.net/juju-core/upstart" 32 "launchpad.net/juju-core/version" 33 "launchpad.net/juju-core/worker" 34 "launchpad.net/juju-core/worker/cleaner" 35 "launchpad.net/juju-core/worker/instancepoller" 36 "launchpad.net/juju-core/worker/localstorage" 37 "launchpad.net/juju-core/worker/minunitsworker" 38 "launchpad.net/juju-core/worker/provisioner" 39 "launchpad.net/juju-core/worker/resumer" 40 "launchpad.net/juju-core/worker/terminationworker" 41 ) 42 43 var logger = loggo.GetLogger("juju.cmd.jujud") 44 45 var newRunner = func(isFatal func(error) bool, moreImportant func(e0, e1 error) bool) worker.Runner { 46 return worker.NewRunner(isFatal, moreImportant) 47 } 48 49 const bootstrapMachineId = "0" 50 51 var retryDelay = 3 * time.Second 52 53 var jujuRun = osenv.JujuRun 54 55 // MachineAgent is a cmd.Command responsible for running a machine agent. 56 type MachineAgent struct { 57 cmd.CommandBase 58 tomb tomb.Tomb 59 Conf AgentConf 60 MachineId string 61 runner worker.Runner 62 upgradeComplete chan struct{} 63 stateOpened chan struct{} 64 st *state.State 65 } 66 67 // Info returns usage information for the command. 68 func (a *MachineAgent) Info() *cmd.Info { 69 return &cmd.Info{ 70 Name: "machine", 71 Purpose: "run a juju machine agent", 72 } 73 } 74 75 func (a *MachineAgent) SetFlags(f *gnuflag.FlagSet) { 76 a.Conf.addFlags(f) 77 f.StringVar(&a.MachineId, "machine-id", "", "id of the machine to run") 78 } 79 80 // Init initializes the command for running. 81 func (a *MachineAgent) Init(args []string) error { 82 if !names.IsMachine(a.MachineId) { 83 return fmt.Errorf("--machine-id option must be set, and expects a non-negative integer") 84 } 85 if err := a.Conf.checkArgs(args); err != nil { 86 return err 87 } 88 a.runner = newRunner(isFatal, moreImportant) 89 a.upgradeComplete = make(chan struct{}) 90 a.stateOpened = make(chan struct{}) 91 return nil 92 } 93 94 // Wait waits for the machine agent to finish. 95 func (a *MachineAgent) Wait() error { 96 return a.tomb.Wait() 97 } 98 99 // Stop stops the machine agent. 100 func (a *MachineAgent) Stop() error { 101 a.runner.Kill() 102 return a.tomb.Wait() 103 } 104 105 // Run runs a machine agent. 106 func (a *MachineAgent) Run(_ *cmd.Context) error { 107 // Due to changes in the logging, and needing to care about old 108 // environments that have been upgraded, we need to explicitly remove the 109 // file writer if one has been added, otherwise we will get duplicate 110 // lines of all logging in the log file. 111 loggo.RemoveWriter("logfile") 112 defer a.tomb.Done() 113 logger.Infof("machine agent %v start (%s)", a.Tag(), version.Current) 114 if err := a.Conf.read(a.Tag()); err != nil { 115 return err 116 } 117 charm.CacheDir = filepath.Join(a.Conf.dataDir, "charmcache") 118 if err := a.initAgent(); err != nil { 119 return err 120 } 121 122 // ensureStateWorker ensures that there is a worker that 123 // connects to the state that runs within itself all the workers 124 // that need a state connection. Unless we're bootstrapping, we 125 // need to connect to the API server to find out if we need to 126 // call this, so we make the APIWorker call it when necessary if 127 // the machine requires it. Note that ensureStateWorker can be 128 // called many times - StartWorker does nothing if there is 129 // already a worker started with the given name. 130 ensureStateWorker := func() { 131 a.runner.StartWorker("state", a.StateWorker) 132 } 133 // We might be bootstrapping, and the API server is not 134 // running yet. If so, make sure we run a state worker instead. 135 if a.MachineId == bootstrapMachineId { 136 // TODO(rog) When we have HA, we only want to do this 137 // when we really are bootstrapping - once other 138 // instances of the API server have been started, we 139 // should follow the normal course of things and ignore 140 // the fact that this was once the bootstrap machine. 141 logger.Infof("Starting StateWorker for machine-0") 142 ensureStateWorker() 143 } 144 a.runner.StartWorker("api", func() (worker.Worker, error) { 145 return a.APIWorker(ensureStateWorker) 146 }) 147 a.runner.StartWorker("termination", func() (worker.Worker, error) { 148 return terminationworker.NewWorker(), nil 149 }) 150 err := a.runner.Wait() 151 if err == worker.ErrTerminateAgent { 152 err = a.uninstallAgent() 153 } 154 err = agentDone(err) 155 a.tomb.Kill(err) 156 return err 157 } 158 159 // setupContainerSupport determines what containers can be run on this machine and 160 // initialises suitable infrastructure to support such containers. 161 func (a *MachineAgent) setupContainerSupport(runner worker.Runner, st *api.State, entity *apiagent.Entity) error { 162 var supportedContainers []instance.ContainerType 163 // We don't yet support nested lxc containers but anything else can run an LXC container. 164 if entity.ContainerType() != instance.LXC { 165 supportedContainers = append(supportedContainers, instance.LXC) 166 } 167 supportsKvm, err := kvm.IsKVMSupported() 168 if err != nil { 169 logger.Warningf("determining kvm support: %v\nno kvm containers possible", err) 170 } 171 if err == nil && supportsKvm { 172 supportedContainers = append(supportedContainers, instance.KVM) 173 } 174 return a.updateSupportedContainers(runner, st, entity.Tag(), supportedContainers) 175 } 176 177 // updateSupportedContainers records in state that a machine can run the specified containers. 178 // It starts a watcher and when a container of a given type is first added to the machine, 179 // the watcher is killed, the machine is set up to be able to start containers of the given type, 180 // and a suitable provisioner is started. 181 func (a *MachineAgent) updateSupportedContainers(runner worker.Runner, st *api.State, 182 tag string, containers []instance.ContainerType) error { 183 184 var machine *apiprovisioner.Machine 185 var err error 186 pr := st.Provisioner() 187 if machine, err = pr.Machine(tag); err != nil { 188 return fmt.Errorf("%s is not in state: %v", tag, err) 189 } 190 if len(containers) == 0 { 191 if err := machine.SupportsNoContainers(); err != nil { 192 return fmt.Errorf("clearing supported containers for %s: %v", tag, err) 193 } 194 return nil 195 } 196 if err := machine.SetSupportedContainers(containers...); err != nil { 197 return fmt.Errorf("setting supported containers for %s: %v", tag, err) 198 } 199 // Start the watcher to fire when a container is first requested on the machine. 200 watcherName := fmt.Sprintf("%s-container-watcher", machine.Id()) 201 handler := provisioner.NewContainerSetupHandler(runner, watcherName, containers, machine, pr, a.Conf.config) 202 a.startWorkerAfterUpgrade(runner, watcherName, func() (worker.Worker, error) { 203 return worker.NewStringsWorker(handler), nil 204 }) 205 return nil 206 } 207 208 // StateJobs returns a worker running all the workers that require 209 // a *state.State connection. 210 func (a *MachineAgent) StateWorker() (worker.Worker, error) { 211 agentConfig := a.Conf.config 212 st, entity, err := openState(agentConfig, a) 213 if err != nil { 214 return nil, err 215 } 216 a.st = st 217 close(a.stateOpened) 218 reportOpenedState(st) 219 m := entity.(*state.Machine) 220 221 runner := newRunner(connectionIsFatal(st), moreImportant) 222 // Take advantage of special knowledge here in that we will only ever want 223 // the storage provider on one machine, and that is the "bootstrap" node. 224 providerType := agentConfig.Value(agent.ProviderType) 225 if (providerType == provider.Local || provider.IsManual(providerType)) && m.Id() == bootstrapMachineId { 226 a.startWorkerAfterUpgrade(runner, "local-storage", func() (worker.Worker, error) { 227 // TODO(axw) 2013-09-24 bug #1229507 228 // Make another job to enable storage. 229 // There's nothing special about this. 230 return localstorage.NewWorker(agentConfig), nil 231 }) 232 } 233 for _, job := range m.Jobs() { 234 switch job { 235 case state.JobHostUnits: 236 // Implemented in APIWorker. 237 case state.JobManageEnviron: 238 a.startWorkerAfterUpgrade(runner, "instancepoller", func() (worker.Worker, error) { 239 return instancepoller.NewWorker(st), nil 240 }) 241 runner.StartWorker("apiserver", func() (worker.Worker, error) { 242 // If the configuration does not have the required information, 243 // it is currently not a recoverable error, so we kill the whole 244 // agent, potentially enabling human intervention to fix 245 // the agent's configuration file. In the future, we may retrieve 246 // the state server certificate and key from the state, and 247 // this should then change. 248 port, cert, key := a.Conf.config.APIServerDetails() 249 if len(cert) == 0 || len(key) == 0 { 250 return nil, &fatalError{"configuration does not have state server cert/key"} 251 } 252 dataDir := a.Conf.config.DataDir() 253 return apiserver.NewServer(st, fmt.Sprintf(":%d", port), cert, key, dataDir) 254 }) 255 a.startWorkerAfterUpgrade(runner, "cleaner", func() (worker.Worker, error) { 256 return cleaner.NewCleaner(st), nil 257 }) 258 a.startWorkerAfterUpgrade(runner, "resumer", func() (worker.Worker, error) { 259 // The action of resumer is so subtle that it is not tested, 260 // because we can't figure out how to do so without brutalising 261 // the transaction log. 262 return resumer.NewResumer(st), nil 263 }) 264 a.startWorkerAfterUpgrade(runner, "minunitsworker", func() (worker.Worker, error) { 265 return minunitsworker.NewMinUnitsWorker(st), nil 266 }) 267 case state.JobManageStateDeprecated: 268 // Legacy environments may set this, but we ignore it. 269 default: 270 logger.Warningf("ignoring unknown job %q", job) 271 } 272 } 273 return newCloseWorker(runner, st), nil 274 } 275 276 // startWorker starts a worker to run the specified child worker but only after waiting for upgrades to complete. 277 func (a *MachineAgent) startWorkerAfterUpgrade(runner worker.Runner, name string, start func() (worker.Worker, error)) { 278 runner.StartWorker(name, func() (worker.Worker, error) { 279 return a.upgradeWaiterWorker(start), nil 280 }) 281 } 282 283 // upgradeWaiterWorker runs the specified worker after upgrades have completed. 284 func (a *MachineAgent) upgradeWaiterWorker(start func() (worker.Worker, error)) worker.Worker { 285 return worker.NewSimpleWorker(func(stop <-chan struct{}) error { 286 // wait for the upgrade to complete (or for us to be stopped) 287 select { 288 case <-stop: 289 return nil 290 case <-a.upgradeComplete: 291 } 292 w, err := start() 293 if err != nil { 294 return err 295 } 296 waitCh := make(chan error) 297 go func() { 298 waitCh <- w.Wait() 299 }() 300 select { 301 case err := <-waitCh: 302 return err 303 case <-stop: 304 w.Kill() 305 } 306 return <-waitCh 307 }) 308 } 309 310 // upgradeWorker runs the required upgrade operations to upgrade to the current Juju version. 311 func (a *MachineAgent) upgradeWorker(apiState *api.State, jobs []params.MachineJob) worker.Worker { 312 return worker.NewSimpleWorker(func(stop <-chan struct{}) error { 313 select { 314 case <-a.upgradeComplete: 315 // Our work is already done (we're probably being restarted 316 // because the API connection has gone down), so do nothing. 317 <-stop 318 return nil 319 default: 320 } 321 // If the machine agent is a state server, wait until state is opened. 322 var st *state.State 323 for _, job := range jobs { 324 if job == params.JobManageEnviron { 325 select { 326 case <-a.stateOpened: 327 } 328 st = a.st 329 break 330 } 331 } 332 err := a.runUpgrades(st, apiState, jobs) 333 if err != nil { 334 return err 335 } 336 logger.Infof("Upgrade to %v completed.", version.Current) 337 close(a.upgradeComplete) 338 <-stop 339 return nil 340 }) 341 } 342 343 // runUpgrades runs the upgrade operations for each job type and updates the updatedToVersion on success. 344 func (a *MachineAgent) runUpgrades(st *state.State, apiState *api.State, jobs []params.MachineJob) error { 345 agentConfig := a.Conf.config 346 from := version.Current 347 from.Number = agentConfig.UpgradedToVersion() 348 if from == version.Current { 349 logger.Infof("Upgrade to %v already completed.", version.Current) 350 return nil 351 } 352 context := upgrades.NewContext(agentConfig, apiState, st) 353 for _, job := range jobs { 354 var target upgrades.Target 355 switch job { 356 case params.JobManageEnviron: 357 target = upgrades.StateServer 358 case params.JobHostUnits: 359 target = upgrades.HostMachine 360 default: 361 continue 362 } 363 logger.Infof("Starting upgrade from %v to %v for %v", from, version.Current, target) 364 if err := upgrades.PerformUpgrade(from.Number, target, context); err != nil { 365 return fmt.Errorf("cannot perform upgrade from %v to %v for %v: %v", from, version.Current, target, err) 366 } 367 } 368 return a.Conf.config.WriteUpgradedToVersion(version.Current.Number) 369 } 370 371 func (a *MachineAgent) Entity(st *state.State) (AgentState, error) { 372 m, err := st.Machine(a.MachineId) 373 if err != nil { 374 return nil, err 375 } 376 // Check the machine nonce as provisioned matches the agent.Conf value. 377 if !m.CheckProvisioned(a.Conf.config.Nonce()) { 378 // The agent is running on a different machine to the one it 379 // should be according to state. It must stop immediately. 380 logger.Errorf("running machine %v agent on inappropriate instance", m) 381 return nil, worker.ErrTerminateAgent 382 } 383 return m, nil 384 } 385 386 func (a *MachineAgent) Tag() string { 387 return names.MachineTag(a.MachineId) 388 } 389 390 func (a *MachineAgent) uninstallAgent() error { 391 var errors []error 392 agentServiceName := a.Conf.config.Value(agent.AgentServiceName) 393 if agentServiceName == "" { 394 // For backwards compatibility, handle lack of AgentServiceName. 395 agentServiceName = os.Getenv("UPSTART_JOB") 396 } 397 if agentServiceName != "" { 398 if err := upstart.NewService(agentServiceName).Remove(); err != nil { 399 errors = append(errors, fmt.Errorf("cannot remove service %q: %v", agentServiceName, err)) 400 } 401 } 402 // Remove the juju-run symlink. 403 if err := os.Remove(jujuRun); err != nil && !os.IsNotExist(err) { 404 errors = append(errors, err) 405 } 406 // The machine agent may terminate without knowing its jobs, 407 // for example if the machine's entry in state was removed. 408 // Thus, we do not rely on jobs here, and instead just check 409 // if the upstart config exists. 410 mongoServiceName := a.Conf.config.Value(agent.MongoServiceName) 411 if mongoServiceName != "" { 412 if err := upstart.NewService(mongoServiceName).StopAndRemove(); err != nil { 413 errors = append(errors, fmt.Errorf("cannot stop/remove service %q: %v", mongoServiceName, err)) 414 } 415 } 416 if err := os.RemoveAll(a.Conf.dataDir); err != nil { 417 errors = append(errors, err) 418 } 419 if len(errors) == 0 { 420 return nil 421 } 422 return fmt.Errorf("uninstall failed: %v", errors) 423 } 424 425 // Below pieces are used for testing,to give us access to the *State opened 426 // by the agent, and allow us to trigger syncs without waiting 5s for them 427 // to happen automatically. 428 429 var stateReporter chan<- *state.State 430 431 func reportOpenedState(st *state.State) { 432 select { 433 case stateReporter <- st: 434 default: 435 } 436 } 437 438 func sendOpenedStates(dst chan<- *state.State) (undo func()) { 439 var original chan<- *state.State 440 original, stateReporter = stateReporter, dst 441 return func() { stateReporter = original } 442 } 443 444 var apiReporter chan<- *api.State 445 446 func reportOpenedAPI(st *api.State) { 447 select { 448 case apiReporter <- st: 449 default: 450 } 451 } 452 func sendOpenedAPIs(dst chan<- *api.State) (undo func()) { 453 var original chan<- *api.State 454 original, apiReporter = apiReporter, dst 455 return func() { apiReporter = original } 456 }