launchpad.net/~rogpeppe/juju-core/500-errgo-fix@v0.0.0-20140213181702-000000002356/cmd/jujud/machine.go (about) 1 // Copyright 2012, 2013 Canonical Ltd. 2 // Licensed under the AGPLv3, see LICENCE file for details. 3 4 package main 5 6 import ( 7 "fmt" 8 "os" 9 "path/filepath" 10 "time" 11 12 "github.com/loggo/loggo" 13 "launchpad.net/errgo/errors" 14 "launchpad.net/gnuflag" 15 "launchpad.net/tomb" 16 17 "launchpad.net/juju-core/agent" 18 "launchpad.net/juju-core/charm" 19 "launchpad.net/juju-core/cmd" 20 "launchpad.net/juju-core/container/kvm" 21 "launchpad.net/juju-core/instance" 22 "launchpad.net/juju-core/log/syslog" 23 "launchpad.net/juju-core/names" 24 "launchpad.net/juju-core/provider" 25 "launchpad.net/juju-core/state" 26 "launchpad.net/juju-core/state/api" 27 apiagent "launchpad.net/juju-core/state/api/agent" 28 "launchpad.net/juju-core/state/api/params" 29 apiprovisioner "launchpad.net/juju-core/state/api/provisioner" 30 "launchpad.net/juju-core/state/apiserver" 31 "launchpad.net/juju-core/upstart" 32 "launchpad.net/juju-core/worker" 33 "launchpad.net/juju-core/worker/authenticationworker" 34 "launchpad.net/juju-core/worker/charmrevisionworker" 35 "launchpad.net/juju-core/worker/cleaner" 36 "launchpad.net/juju-core/worker/deployer" 37 "launchpad.net/juju-core/worker/firewaller" 38 "launchpad.net/juju-core/worker/instancepoller" 39 "launchpad.net/juju-core/worker/localstorage" 40 workerlogger "launchpad.net/juju-core/worker/logger" 41 "launchpad.net/juju-core/worker/machineenvironmentworker" 42 "launchpad.net/juju-core/worker/machiner" 43 "launchpad.net/juju-core/worker/minunitsworker" 44 "launchpad.net/juju-core/worker/provisioner" 45 "launchpad.net/juju-core/worker/resumer" 46 "launchpad.net/juju-core/worker/terminationworker" 47 "launchpad.net/juju-core/worker/upgrader" 48 ) 49 50 var logger = loggo.GetLogger("juju.cmd.jujud") 51 52 var mask = errors.Mask 53 54 var newRunner = func(isFatal func(error) bool, moreImportant func(e0, e1 error) bool) worker.Runner { 55 return worker.NewRunner(isFatal, moreImportant) 56 } 57 58 const bootstrapMachineId = "0" 59 60 var retryDelay = 3 * time.Second 61 62 var jujuRun = "/usr/local/bin/juju-run" 63 64 // MachineAgent is a cmd.Command responsible for running a machine agent. 65 type MachineAgent struct { 66 cmd.CommandBase 67 tomb tomb.Tomb 68 Conf AgentConf 69 MachineId string 70 runner worker.Runner 71 } 72 73 // Info returns usage information for the command. 74 func (a *MachineAgent) Info() *cmd.Info { 75 return &cmd.Info{ 76 Name: "machine", 77 Purpose: "run a juju machine agent", 78 } 79 } 80 81 func (a *MachineAgent) SetFlags(f *gnuflag.FlagSet) { 82 a.Conf.addFlags(f) 83 f.StringVar(&a.MachineId, "machine-id", "", "id of the machine to run") 84 } 85 86 // Init initializes the command for running. 87 func (a *MachineAgent) Init(args []string) error { 88 if !names.IsMachine(a.MachineId) { 89 return fmt.Errorf("--machine-id option must be set, and expects a non-negative integer") 90 } 91 if err := a.Conf.checkArgs(args); err != nil { 92 return err 93 } 94 a.runner = newRunner(isFatal, moreImportant) 95 return nil 96 } 97 98 // Wait waits for the machine agent to finish. 99 func (a *MachineAgent) Wait() error { 100 return a.tomb.Wait() 101 } 102 103 // Stop stops the machine agent. 104 func (a *MachineAgent) Stop() error { 105 a.runner.Kill() 106 return a.tomb.Wait() 107 } 108 109 // Run runs a machine agent. 110 func (a *MachineAgent) Run(_ *cmd.Context) error { 111 // Due to changes in the logging, and needing to care about old 112 // environments that have been upgraded, we need to explicitly remove the 113 // file writer if one has been added, otherwise we will get duplicate 114 // lines of all logging in the log file. 115 loggo.RemoveWriter("logfile") 116 defer a.tomb.Done() 117 logger.Infof("machine agent %v start", a.Tag()) 118 if err := a.Conf.read(a.Tag()); err != nil { 119 return err 120 } 121 charm.CacheDir = filepath.Join(a.Conf.dataDir, "charmcache") 122 if err := a.initAgent(); err != nil { 123 return err 124 } 125 126 // ensureStateWorker ensures that there is a worker that 127 // connects to the state that runs within itself all the workers 128 // that need a state connection. Unless we're bootstrapping, we 129 // need to connect to the API server to find out if we need to 130 // call this, so we make the APIWorker call it when necessary if 131 // the machine requires it. Note that ensureStateWorker can be 132 // called many times - StartWorker does nothing if there is 133 // already a worker started with the given name. 134 ensureStateWorker := func() { 135 a.runner.StartWorker("state", a.StateWorker) 136 } 137 // We might be bootstrapping, and the API server is not 138 // running yet. If so, make sure we run a state worker instead. 139 if a.MachineId == bootstrapMachineId { 140 // TODO(rog) When we have HA, we only want to do this 141 // when we really are bootstrapping - once other 142 // instances of the API server have been started, we 143 // should follow the normal course of things and ignore 144 // the fact that this was once the bootstrap machine. 145 logger.Infof("Starting StateWorker for machine-0") 146 ensureStateWorker() 147 } 148 a.runner.StartWorker("api", func() (worker.Worker, error) { 149 return a.APIWorker(ensureStateWorker) 150 }) 151 a.runner.StartWorker("termination", func() (worker.Worker, error) { 152 return terminationworker.NewWorker(), nil 153 }) 154 err := a.runner.Wait() 155 if err == worker.ErrTerminateAgent { 156 err = a.uninstallAgent() 157 } 158 err = agentDone(err) 159 a.tomb.Kill(err) 160 return err 161 } 162 163 // APIWorker returns a Worker that connects to the API and starts any 164 // workers that need an API connection. 165 // 166 // If a state worker is necessary, APIWorker calls ensureStateWorker. 167 func (a *MachineAgent) APIWorker(ensureStateWorker func()) (worker.Worker, error) { 168 agentConfig := a.Conf.config 169 st, entity, err := openAPIState(agentConfig, a) 170 if err != nil { 171 return nil, err 172 } 173 reportOpenedAPI(st) 174 for _, job := range entity.Jobs() { 175 if job.NeedsState() { 176 ensureStateWorker() 177 break 178 } 179 } 180 runner := newRunner(connectionIsFatal(st), moreImportant) 181 runner.StartWorker("machiner", func() (worker.Worker, error) { 182 return machiner.NewMachiner(st.Machiner(), agentConfig), nil 183 }) 184 runner.StartWorker("upgrader", func() (worker.Worker, error) { 185 return upgrader.NewUpgrader(st.Upgrader(), agentConfig), nil 186 }) 187 runner.StartWorker("logger", func() (worker.Worker, error) { 188 return workerlogger.NewLogger(st.Logger(), agentConfig), nil 189 }) 190 runner.StartWorker("machineenvironmentworker", func() (worker.Worker, error) { 191 return machineenvironmentworker.NewMachineEnvironmentWorker(st.Environment(), agentConfig), nil 192 }) 193 194 // If not a local provider bootstrap machine, start the worker to manage SSH keys. 195 providerType := agentConfig.Value(agent.ProviderType) 196 if providerType != provider.Local || a.MachineId != bootstrapMachineId { 197 runner.StartWorker("authenticationworker", func() (worker.Worker, error) { 198 return authenticationworker.NewWorker(st.KeyUpdater(), agentConfig), nil 199 }) 200 } 201 202 // Perform the operations needed to set up hosting for containers. 203 if err := a.setupContainerSupport(runner, st, entity); err != nil { 204 return nil, fmt.Errorf("setting up container support: %v", err) 205 } 206 for _, job := range entity.Jobs() { 207 switch job { 208 case params.JobHostUnits: 209 runner.StartWorker("deployer", func() (worker.Worker, error) { 210 apiDeployer := st.Deployer() 211 context := newDeployContext(apiDeployer, agentConfig) 212 return deployer.NewDeployer(apiDeployer, context), nil 213 }) 214 case params.JobManageEnviron: 215 runner.StartWorker("environ-provisioner", func() (worker.Worker, error) { 216 return provisioner.NewEnvironProvisioner(st.Provisioner(), agentConfig), nil 217 }) 218 // TODO(axw) 2013-09-24 bug #1229506 219 // Make another job to enable the firewaller. Not all environments 220 // are capable of managing ports centrally. 221 runner.StartWorker("firewaller", func() (worker.Worker, error) { 222 return firewaller.NewFirewaller(st.Firewaller()) 223 }) 224 runner.StartWorker("charm-revision-updater", func() (worker.Worker, error) { 225 return charmrevisionworker.NewRevisionUpdateWorker(st.CharmRevisionUpdater()), nil 226 }) 227 case params.JobManageState: 228 // Legacy environments may set this, but we ignore it. 229 default: 230 // TODO(dimitern): Once all workers moved over to using 231 // the API, report "unknown job type" here. 232 } 233 } 234 return newCloseWorker(runner, st), nil // Note: a worker.Runner is itself a worker.Worker. 235 } 236 237 // setupContainerSupport determines what containers can be run on this machine and 238 // initialises suitable infrastructure to support such containers. 239 func (a *MachineAgent) setupContainerSupport(runner worker.Runner, st *api.State, entity *apiagent.Entity) error { 240 var supportedContainers []instance.ContainerType 241 // We don't yet support nested lxc containers but anything else can run an LXC container. 242 if entity.ContainerType() != instance.LXC { 243 supportedContainers = append(supportedContainers, instance.LXC) 244 } 245 supportsKvm, err := kvm.IsKVMSupported() 246 if err != nil { 247 logger.Warningf("determining kvm support: %v\nno kvm containers possible", err) 248 } 249 if err == nil && supportsKvm { 250 supportedContainers = append(supportedContainers, instance.KVM) 251 } 252 return a.updateSupportedContainers(runner, st, entity.Tag(), supportedContainers) 253 } 254 255 // updateSupportedContainers records in state that a machine can run the specified containers. 256 // It starts a watcher and when a container of a given type is first added to the machine, 257 // the watcher is killed, the machine is set up to be able to start containers of the given type, 258 // and a suitable provisioner is started. 259 func (a *MachineAgent) updateSupportedContainers(runner worker.Runner, st *api.State, 260 tag string, containers []instance.ContainerType) error { 261 262 var machine *apiprovisioner.Machine 263 var err error 264 pr := st.Provisioner() 265 if machine, err = pr.Machine(tag); err != nil { 266 return fmt.Errorf("%s is not in state: %v", tag, err) 267 } 268 if len(containers) == 0 { 269 if err := machine.SupportsNoContainers(); err != nil { 270 return fmt.Errorf("clearing supported containers for %s: %v", tag, err) 271 } 272 return nil 273 } 274 if err := machine.SetSupportedContainers(containers...); err != nil { 275 return fmt.Errorf("setting supported containers for %s: %v", tag, err) 276 } 277 // Start the watcher to fire when a container is first requested on the machine. 278 watcherName := fmt.Sprintf("%s-container-watcher", machine.Id()) 279 handler := provisioner.NewContainerSetupHandler(runner, watcherName, containers, machine, pr, a.Conf.config) 280 runner.StartWorker(watcherName, func() (worker.Worker, error) { 281 return worker.NewStringsWorker(handler), nil 282 }) 283 return nil 284 } 285 286 // StateJobs returns a worker running all the workers that require 287 // a *state.State connection. 288 func (a *MachineAgent) StateWorker() (worker.Worker, error) { 289 agentConfig := a.Conf.config 290 st, entity, err := openState(agentConfig, a) 291 if err != nil { 292 return nil, err 293 } 294 reportOpenedState(st) 295 m := entity.(*state.Machine) 296 297 runner := newRunner(connectionIsFatal(st), moreImportant) 298 // Take advantage of special knowledge here in that we will only ever want 299 // the storage provider on one machine, and that is the "bootstrap" node. 300 providerType := agentConfig.Value(agent.ProviderType) 301 if (providerType == provider.Local || provider.IsManual(providerType)) && m.Id() == bootstrapMachineId { 302 runner.StartWorker("local-storage", func() (worker.Worker, error) { 303 // TODO(axw) 2013-09-24 bug #1229507 304 // Make another job to enable storage. 305 // There's nothing special about this. 306 return localstorage.NewWorker(agentConfig), nil 307 }) 308 } 309 for _, job := range m.Jobs() { 310 switch job { 311 case state.JobHostUnits: 312 // Implemented in APIWorker. 313 case state.JobManageEnviron: 314 runner.StartWorker("instancepoller", func() (worker.Worker, error) { 315 return instancepoller.NewWorker(st), nil 316 }) 317 runner.StartWorker("apiserver", func() (worker.Worker, error) { 318 // If the configuration does not have the required information, 319 // it is currently not a recoverable error, so we kill the whole 320 // agent, potentially enabling human intervention to fix 321 // the agent's configuration file. In the future, we may retrieve 322 // the state server certificate and key from the state, and 323 // this should then change. 324 port, cert, key := a.Conf.config.APIServerDetails() 325 if len(cert) == 0 || len(key) == 0 { 326 return nil, &fatalError{"configuration does not have state server cert/key"} 327 } 328 dataDir := a.Conf.config.DataDir() 329 return apiserver.NewServer(st, fmt.Sprintf(":%d", port), cert, key, dataDir) 330 }) 331 runner.StartWorker("cleaner", func() (worker.Worker, error) { 332 return cleaner.NewCleaner(st), nil 333 }) 334 runner.StartWorker("resumer", func() (worker.Worker, error) { 335 // The action of resumer is so subtle that it is not tested, 336 // because we can't figure out how to do so without brutalising 337 // the transaction log. 338 return resumer.NewResumer(st), nil 339 }) 340 runner.StartWorker("minunitsworker", func() (worker.Worker, error) { 341 return minunitsworker.NewMinUnitsWorker(st), nil 342 }) 343 case state.JobManageState: 344 // Legacy environments may set this, but we ignore it. 345 default: 346 logger.Warningf("ignoring unknown job %q", job) 347 } 348 } 349 return newCloseWorker(runner, st), nil 350 } 351 352 func (a *MachineAgent) Entity(st *state.State) (AgentState, error) { 353 m, err := st.Machine(a.MachineId) 354 if err != nil { 355 return nil, err 356 } 357 // Check the machine nonce as provisioned matches the agent.Conf value. 358 if !m.CheckProvisioned(a.Conf.config.Nonce()) { 359 // The agent is running on a different machine to the one it 360 // should be according to state. It must stop immediately. 361 logger.Errorf("running machine %v agent on inappropriate instance", m) 362 return nil, worker.ErrTerminateAgent 363 } 364 return m, nil 365 } 366 367 func (a *MachineAgent) Tag() string { 368 return names.MachineTag(a.MachineId) 369 } 370 371 func (a *MachineAgent) initAgent() error { 372 if err := os.Remove(jujuRun); err != nil && !os.IsNotExist(err) { 373 return err 374 } 375 jujud := filepath.Join(a.Conf.dataDir, "tools", a.Tag(), "jujud") 376 return os.Symlink(jujud, jujuRun) 377 } 378 379 func (a *MachineAgent) uninstallAgent() error { 380 var errors []error 381 agentServiceName := a.Conf.config.Value(agent.AgentServiceName) 382 if agentServiceName == "" { 383 // For backwards compatibility, handle lack of AgentServiceName. 384 agentServiceName = os.Getenv("UPSTART_JOB") 385 } 386 if agentServiceName != "" { 387 if err := upstart.NewService(agentServiceName).Remove(); err != nil { 388 errors = append(errors, fmt.Errorf("cannot remove service %q: %v", agentServiceName, err)) 389 } 390 } 391 // Remove the rsyslog conf file and restart rsyslogd. 392 if rsyslogConfPath := a.Conf.config.Value(agent.RsyslogConfPath); rsyslogConfPath != "" { 393 if err := os.Remove(rsyslogConfPath); err != nil { 394 errors = append(errors, err) 395 } 396 if err := syslog.Restart(); err != nil { 397 errors = append(errors, err) 398 } 399 } 400 // Remove the juju-run symlink. 401 if err := os.Remove(jujuRun); err != nil && !os.IsNotExist(err) { 402 errors = append(errors, err) 403 } 404 // The machine agent may terminate without knowing its jobs, 405 // for example if the machine's entry in state was removed. 406 // Thus, we do not rely on jobs here, and instead just check 407 // if the upstart config exists. 408 mongoServiceName := a.Conf.config.Value(agent.MongoServiceName) 409 if mongoServiceName != "" { 410 if err := upstart.NewService(mongoServiceName).StopAndRemove(); err != nil { 411 errors = append(errors, fmt.Errorf("cannot stop/remove service %q: %v", mongoServiceName, err)) 412 } 413 } 414 if err := os.RemoveAll(a.Conf.dataDir); err != nil { 415 errors = append(errors, err) 416 } 417 if len(errors) == 0 { 418 return nil 419 } 420 return fmt.Errorf("uninstall failed: %v", errors) 421 } 422 423 // Below pieces are used for testing,to give us access to the *State opened 424 // by the agent, and allow us to trigger syncs without waiting 5s for them 425 // to happen automatically. 426 427 var stateReporter chan<- *state.State 428 429 func reportOpenedState(st *state.State) { 430 select { 431 case stateReporter <- st: 432 default: 433 } 434 } 435 436 func sendOpenedStates(dst chan<- *state.State) (undo func()) { 437 var original chan<- *state.State 438 original, stateReporter = stateReporter, dst 439 return func() { stateReporter = original } 440 } 441 442 var apiReporter chan<- *api.State 443 444 func reportOpenedAPI(st *api.State) { 445 select { 446 case apiReporter <- st: 447 default: 448 } 449 } 450 func sendOpenedAPIs(dst chan<- *api.State) (undo func()) { 451 var original chan<- *api.State 452 original, apiReporter = apiReporter, dst 453 return func() { apiReporter = original } 454 }