// Source: launchpad.net/~rogpeppe/juju-core/500-errgo-fix@v0.0.0-20140213181702-000000002356/worker/provisioner/provisioner_task.go

// Copyright 2012, 2013 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package provisioner

import (
	"fmt"

	"launchpad.net/errgo/errors"
	"launchpad.net/tomb"

	"launchpad.net/juju-core/constraints"
	"launchpad.net/juju-core/environs"
	"launchpad.net/juju-core/environs/cloudinit"
	"launchpad.net/juju-core/environs/tools"
	"launchpad.net/juju-core/instance"
	"launchpad.net/juju-core/names"
	"launchpad.net/juju-core/state/api/params"
	apiprovisioner "launchpad.net/juju-core/state/api/provisioner"
	"launchpad.net/juju-core/state/watcher"
	coretools "launchpad.net/juju-core/tools"
	"launchpad.net/juju-core/utils"
	"launchpad.net/juju-core/worker"
)

// ProvisionerTask is the interface satisfied by the value returned from
// NewProvisionerTask. On top of worker.Worker it exposes Stop, Dying and
// Err, and allows safe mode to be toggled while the task is running.
type ProvisionerTask interface {
	worker.Worker
	Stop() error
	Dying() <-chan struct{}
	Err() error

	// SetSafeMode sets a flag to indicate whether the provisioner task
	// runs in safe mode or not. In safe mode, any running instances
	// which do not exist in state are allowed to keep running rather than
	// being shut down.
	SetSafeMode(safeMode bool)
}

// Watcher is the change source consumed by the provisioner task. The
// task's loop passes every slice received from Changes to
// processMachines, where the strings are treated as machine ids.
type Watcher interface {
	watcher.Errer
	watcher.Stopper
	Changes() <-chan []string
}

// MachineGetter looks up a provisioner API machine by its tag.
type MachineGetter interface {
	Machine(tag string) (*apiprovisioner.Machine, error)
}

// NewProvisionerTask builds a provisioner task from the given
// collaborators and starts its main loop in a new goroutine. When the
// loop returns, its result is handed to the tomb's Kill and the tomb is
// marked as done, so Wait and Err report the loop's outcome.
func NewProvisionerTask(
	machineTag string,
	safeMode bool,
	machineGetter MachineGetter,
	watcher Watcher,
	broker environs.InstanceBroker,
	auth environs.AuthenticationProvider,
) ProvisionerTask {
	task := &provisionerTask{
		machineTag:     machineTag,
		machineGetter:  machineGetter,
		machineWatcher: watcher,
		broker:         broker,
		auth:           auth,
		safeMode:       safeMode,
		// Buffered with capacity 1; written by SetSafeMode and read by loop.
		safeModeChan: make(chan bool, 1),
		// The instances map is not created here; populateMachineMaps
		// rebuilds it on every call.
		machines: make(map[string]*apiprovisioner.Machine),
	}
	go func() {
		defer task.tomb.Done()
		task.tomb.Kill(task.loop())
	}()
	return task
}

// provisionerTask is the concrete ProvisionerTask implementation
// returned by NewProvisionerTask.
type provisionerTask struct {
	// machineTag is used in log messages and as the prefix of the
	// nonce generated for each new instance.
	machineTag     string
	machineGetter  MachineGetter
	machineWatcher Watcher
	broker         environs.InstanceBroker
	tomb           tomb.Tomb
	auth           environs.AuthenticationProvider

	// safeMode holds the current safe mode setting; requested changes
	// arrive on safeModeChan and are applied by loop.
	safeMode     bool
	safeModeChan chan bool

	// instance id -> instance
	instances map[instance.Id]instance.Instance
	// machine id -> machine
	machines map[string]*apiprovisioner.Machine
}

// Kill implements worker.Worker.Kill.
func (task *provisionerTask) Kill() {
	task.tomb.Kill(nil)
}

// Wait implements worker.Worker.Wait.
97 func (task *provisionerTask) Wait() error { 98 return task.tomb.Wait() 99 } 100 101 func (task *provisionerTask) Stop() error { 102 task.Kill() 103 return task.Wait() 104 } 105 106 func (task *provisionerTask) Dying() <-chan struct{} { 107 return task.tomb.Dying() 108 } 109 110 func (task *provisionerTask) Err() error { 111 return task.tomb.Err() 112 } 113 114 func (task *provisionerTask) loop() error { 115 logger.Infof("Starting up provisioner task %s", task.machineTag) 116 defer watcher.Stop(task.machineWatcher, &task.tomb) 117 118 // Don't allow the safe mode to change until we have 119 // read at least one set of changes, which will populate 120 // the task.machines map. Otherwise we will potentially 121 // see all legitimate instances as unknown. 122 var safeModeChan chan bool 123 124 // When the watcher is started, it will have the initial changes be all 125 // the machines that are relevant. Also, since this is available straight 126 // away, we know there will be some changes right off the bat. 127 for { 128 select { 129 case <-task.tomb.Dying(): 130 logger.Infof("Shutting down provisioner task %s", task.machineTag) 131 return tomb.ErrDying 132 case ids, ok := <-task.machineWatcher.Changes(): 133 if !ok { 134 return watcher.MustErr(task.machineWatcher) 135 } 136 // TODO(dfc; lp:1042717) fire process machines periodically to shut down unknown 137 // instances. 138 if err := task.processMachines(ids); err != nil { 139 return errors.Notef(err, "failed to process updated machines") 140 } 141 // We've seen a set of changes. Enable safe mode change. 142 safeModeChan = task.safeModeChan 143 case safeMode := <-safeModeChan: 144 if safeMode == task.safeMode { 145 break 146 } 147 logger.Infof("safe mode changed to %v", safeMode) 148 task.safeMode = safeMode 149 if !safeMode { 150 // Safe mode has been disabled, so process current machines 151 // so that unknown machines will be immediately dealt with. 
152 if err := task.processMachines(nil); err != nil { 153 return errors.Notef(err, "failed to process machines after safe mode disabled") 154 } 155 } 156 } 157 } 158 } 159 160 // SetSafeMode implements ProvisionerTask.SetSafeMode(). 161 func (task *provisionerTask) SetSafeMode(safeMode bool) { 162 select { 163 case task.safeModeChan <- safeMode: 164 case <-task.Dying(): 165 } 166 } 167 168 func (task *provisionerTask) processMachines(ids []string) error { 169 logger.Tracef("processMachines(%v)", ids) 170 // Populate the tasks maps of current instances and machines. 171 err := task.populateMachineMaps(ids) 172 if err != nil { 173 return mask(err) 174 } 175 176 // Find machines without an instance id or that are dead 177 pending, dead, err := task.pendingOrDead(ids) 178 if err != nil { 179 return mask(err) 180 } 181 182 // Stop all machines that are dead 183 stopping := task.instancesForMachines(dead) 184 185 // Find running instances that have no machines associated 186 unknown, err := task.findUnknownInstances(stopping) 187 if err != nil { 188 return mask(err) 189 } 190 if task.safeMode { 191 logger.Infof("running in safe mode, unknown instances not stopped %v", instanceIds(unknown)) 192 unknown = nil 193 } 194 if len(stopping) > 0 { 195 logger.Infof("stopping known instances %v", stopping) 196 } 197 if len(unknown) > 0 { 198 logger.Infof("stopping unknown instances %v", instanceIds(unknown)) 199 } 200 // It's important that we stop unknown instances before starting 201 // pending ones, because if we start an instance and then fail to 202 // set its InstanceId on the machine we don't want to start a new 203 // instance for the same machine ID. 204 if err := task.stopInstances(append(stopping, unknown...)); err != nil { 205 return mask(err) 206 } 207 208 // Remove any dead machines from state. 
209 for _, machine := range dead { 210 logger.Infof("removing dead machine %q", machine) 211 if err := machine.Remove(); err != nil { 212 logger.Errorf("failed to remove dead machine %q", machine) 213 } 214 delete(task.machines, machine.Id()) 215 } 216 217 // Start an instance for the pending ones 218 return task.startMachines(pending) 219 } 220 221 func instanceIds(instances []instance.Instance) []string { 222 ids := make([]string, 0, len(instances)) 223 for _, inst := range instances { 224 ids = append(ids, string(inst.Id())) 225 } 226 return ids 227 } 228 229 func (task *provisionerTask) populateMachineMaps(ids []string) error { 230 task.instances = make(map[instance.Id]instance.Instance) 231 232 instances, err := task.broker.AllInstances() 233 if err != nil { 234 logger.Errorf("failed to get all instances from broker: %v", err) 235 return err 236 } 237 for _, i := range instances { 238 task.instances[i.Id()] = i 239 } 240 241 // Update the machines map with new data for each of the machines in the 242 // change list. 243 // TODO(thumper): update for API server later to get all machines in one go. 244 for _, id := range ids { 245 machineTag := names.MachineTag(id) 246 machine, err := task.machineGetter.Machine(machineTag) 247 switch { 248 case params.IsCodeNotFoundOrCodeUnauthorized(err): 249 logger.Debugf("machine %q not found in state", id) 250 delete(task.machines, id) 251 case err == nil: 252 task.machines[id] = machine 253 default: 254 logger.Errorf("failed to get machine: %v", err) 255 } 256 } 257 return nil 258 } 259 260 // pendingOrDead looks up machines with ids and returns those that do not 261 // have an instance id assigned yet, and also those that are dead. 
262 func (task *provisionerTask) pendingOrDead(ids []string) (pending, dead []*apiprovisioner.Machine, err error) { 263 for _, id := range ids { 264 machine, found := task.machines[id] 265 if !found { 266 logger.Infof("machine %q not found", id) 267 continue 268 } 269 switch machine.Life() { 270 case params.Dying: 271 if _, err := machine.InstanceId(); err == nil { 272 continue 273 } else if !params.IsCodeNotProvisioned(err) { 274 logger.Errorf("failed to load machine %q instance id: %v", machine, err) 275 return nil, nil, err 276 } 277 logger.Infof("killing dying, unprovisioned machine %q", machine) 278 if err := machine.EnsureDead(); err != nil { 279 logger.Errorf("failed to ensure machine dead %q: %v", machine, err) 280 return nil, nil, err 281 } 282 fallthrough 283 case params.Dead: 284 dead = append(dead, machine) 285 continue 286 } 287 if instId, err := machine.InstanceId(); err != nil { 288 if !params.IsCodeNotProvisioned(err) { 289 logger.Errorf("failed to load machine %q instance id: %v", machine, err) 290 continue 291 } 292 status, _, err := machine.Status() 293 if err != nil { 294 logger.Infof("cannot get machine %q status: %v", machine, err) 295 continue 296 } 297 if status == params.StatusPending { 298 pending = append(pending, machine) 299 logger.Infof("found machine %q pending provisioning", machine) 300 continue 301 } 302 } else { 303 logger.Infof("machine %v already started as instance %q", machine, instId) 304 } 305 } 306 logger.Tracef("pending machines: %v", pending) 307 logger.Tracef("dead machines: %v", dead) 308 return 309 } 310 311 // findUnknownInstances finds instances which are not associated with a machine. 312 func (task *provisionerTask) findUnknownInstances(stopping []instance.Instance) ([]instance.Instance, error) { 313 // Make a copy of the instances we know about. 
314 instances := make(map[instance.Id]instance.Instance) 315 for k, v := range task.instances { 316 instances[k] = v 317 } 318 319 for _, m := range task.machines { 320 instId, err := m.InstanceId() 321 switch { 322 case err == nil: 323 delete(instances, instId) 324 case params.IsCodeNotProvisioned(err): 325 case params.IsCodeNotFoundOrCodeUnauthorized(err): 326 default: 327 return nil, err 328 } 329 } 330 // Now remove all those instances that we are stopping already as we 331 // know about those and don't want to include them in the unknown list. 332 for _, inst := range stopping { 333 delete(instances, inst.Id()) 334 } 335 var unknown []instance.Instance 336 for _, inst := range instances { 337 unknown = append(unknown, inst) 338 } 339 return unknown, nil 340 } 341 342 // instancesForMachines returns a list of instance.Instance that represent 343 // the list of machines running in the provider. Missing machines are 344 // omitted from the list. 345 func (task *provisionerTask) instancesForMachines(machines []*apiprovisioner.Machine) []instance.Instance { 346 var instances []instance.Instance 347 for _, machine := range machines { 348 instId, err := machine.InstanceId() 349 if err == nil { 350 instance, found := task.instances[instId] 351 // If the instance is not found we can't stop it. 352 if found { 353 instances = append(instances, instance) 354 } 355 } 356 } 357 return instances 358 } 359 360 func (task *provisionerTask) stopInstances(instances []instance.Instance) error { 361 // Although calling StopInstance with an empty slice should produce no change in the 362 // provider, environs like dummy do not consider this a noop. 
363 if len(instances) == 0 { 364 return nil 365 } 366 if err := task.broker.StopInstances(instances); err != nil { 367 logger.Errorf("broker failed to stop instances: %v", err) 368 return err 369 } 370 return nil 371 } 372 373 func (task *provisionerTask) startMachines(machines []*apiprovisioner.Machine) error { 374 for _, m := range machines { 375 if err := task.startMachine(m); err != nil { 376 return errors.Notef(err, "cannot start machine %v", m) 377 } 378 } 379 return nil 380 } 381 382 func (task *provisionerTask) startMachine(machine *apiprovisioner.Machine) error { 383 cons, err := machine.Constraints() 384 if err != nil { 385 return mask(err) 386 } 387 series, err := machine.Series() 388 if err != nil { 389 return mask(err) 390 } 391 possibleTools, err := task.possibleTools(series, cons) 392 if err != nil { 393 return mask(err) 394 } 395 machineConfig, err := task.machineConfig(machine) 396 if err != nil { 397 return mask(err) 398 } 399 inst, metadata, err := task.broker.StartInstance(cons, possibleTools, machineConfig) 400 if err != nil { 401 // Set the state to error, so the machine will be skipped next 402 // time until the error is resolved, but don't return an 403 // error; just keep going with the other machines. 404 logger.Errorf("cannot start instance for machine %q: %v", machine, err) 405 if err1 := machine.SetStatus(params.StatusError, err.Error()); err1 != nil { 406 // Something is wrong with this machine, better report it back. 407 logger.Errorf("cannot set error status for machine %q: %v", machine, err1) 408 return err1 409 } 410 return nil 411 } 412 nonce := machineConfig.MachineNonce 413 if err := machine.SetProvisioned(inst.Id(), nonce, metadata); err != nil { 414 logger.Errorf("cannot register instance for machine %v: %v", machine, err) 415 // The machine is started, but we can't record the mapping in 416 // state. 
It'll keep running while we fail out and restart, 417 // but will then be detected by findUnknownInstances and 418 // killed again. 419 // 420 // TODO(dimitern) Stop the instance right away here. 421 // 422 // Multiple instantiations of a given machine (with the same 423 // machine ID) cannot coexist, because findUnknownInstances is 424 // called before startMachines. However, if the first machine 425 // had started to do work before being replaced, we may 426 // encounter surprising problems. 427 return err 428 } 429 logger.Infof("started machine %s as instance %s with hardware %q", machine, inst.Id(), metadata) 430 return nil 431 } 432 433 func (task *provisionerTask) possibleTools(series string, cons constraints.Value) (coretools.List, error) { 434 if env, ok := task.broker.(environs.Environ); ok { 435 agentVersion, ok := env.Config().AgentVersion() 436 if !ok { 437 return nil, errors.Newf("no agent version set in environment configuration") 438 } 439 return tools.FindInstanceTools(env, agentVersion, series, cons.Arch) 440 } 441 if hasTools, ok := task.broker.(coretools.HasTools); ok { 442 return hasTools.Tools(), nil 443 } 444 panic(errors.Newf("broker of type %T does not provide any tools", task.broker)) 445 } 446 447 func (task *provisionerTask) machineConfig(machine *apiprovisioner.Machine) (*cloudinit.MachineConfig, error) { 448 stateInfo, apiInfo, err := task.auth.SetupAuthentication(machine) 449 if err != nil { 450 logger.Errorf("failed to setup authentication: %v", err) 451 return nil, err 452 } 453 // Generated a nonce for the new instance, with the format: "machine-#:UUID". 454 // The first part is a badge, specifying the tag of the machine the provisioner 455 // is running on, while the second part is a random UUID. 
456 uuid, err := utils.NewUUID() 457 if err != nil { 458 return nil, mask(err) 459 } 460 nonce := fmt.Sprintf("%s:%s", task.machineTag, uuid.String()) 461 machineConfig := environs.NewMachineConfig(machine.Id(), nonce, stateInfo, apiInfo) 462 return machineConfig, nil 463 }