github.com/cloudbase/juju-core@v0.0.0-20140504232958-a7271ac7912f/worker/provisioner/provisioner_task.go (about) 1 // Copyright 2012, 2013 Canonical Ltd. 2 // Licensed under the AGPLv3, see LICENCE file for details. 3 4 package provisioner 5 6 import ( 7 "fmt" 8 9 "launchpad.net/tomb" 10 11 "launchpad.net/juju-core/constraints" 12 "launchpad.net/juju-core/environs" 13 "launchpad.net/juju-core/environs/cloudinit" 14 "launchpad.net/juju-core/environs/tools" 15 "launchpad.net/juju-core/instance" 16 "launchpad.net/juju-core/names" 17 "launchpad.net/juju-core/state/api/params" 18 apiprovisioner "launchpad.net/juju-core/state/api/provisioner" 19 "launchpad.net/juju-core/state/watcher" 20 coretools "launchpad.net/juju-core/tools" 21 "launchpad.net/juju-core/utils" 22 "launchpad.net/juju-core/worker" 23 ) 24 25 type ProvisionerTask interface { 26 worker.Worker 27 Stop() error 28 Dying() <-chan struct{} 29 Err() error 30 31 // SetSafeMode sets a flag to indicate whether the provisioner task 32 // runs in safe mode or not. In safe mode, any running instances 33 // which do no exist in state are allowed to keep running rather than 34 // being shut down. 35 SetSafeMode(safeMode bool) 36 } 37 38 type Watcher interface { 39 watcher.Errer 40 watcher.Stopper 41 Changes() <-chan []string 42 } 43 44 type MachineGetter interface { 45 Machine(tag string) (*apiprovisioner.Machine, error) 46 } 47 48 func NewProvisionerTask( 49 machineTag string, 50 safeMode bool, 51 machineGetter MachineGetter, 52 watcher Watcher, 53 broker environs.InstanceBroker, 54 auth environs.AuthenticationProvider, 55 ) ProvisionerTask { 56 task := &provisionerTask{ 57 machineTag: machineTag, 58 machineGetter: machineGetter, 59 machineWatcher: watcher, 60 broker: broker, 61 auth: auth, 62 safeMode: safeMode, 63 safeModeChan: make(chan bool, 1), 64 machines: make(map[string]*apiprovisioner.Machine), 65 } 66 go func() { 67 defer task.tomb.Done() 68 task.tomb.Kill(task.loop()) 69 }() 70 return task 71 } 72 73 type provisionerTask struct { 74 machineTag string 75 machineGetter MachineGetter 76 machineWatcher Watcher 77 broker environs.InstanceBroker 78 tomb tomb.Tomb 79 auth environs.AuthenticationProvider 80 81 safeMode bool 82 safeModeChan chan bool 83 84 // instance id -> instance 85 instances map[instance.Id]instance.Instance 86 // machine id -> machine 87 machines map[string]*apiprovisioner.Machine 88 } 89 90 // Kill implements worker.Worker.Kill. 91 func (task *provisionerTask) Kill() { 92 task.tomb.Kill(nil) 93 } 94 95 // Wait implements worker.Worker.Wait. 96 func (task *provisionerTask) Wait() error { 97 return task.tomb.Wait() 98 } 99 100 func (task *provisionerTask) Stop() error { 101 task.Kill() 102 return task.Wait() 103 } 104 105 func (task *provisionerTask) Dying() <-chan struct{} { 106 return task.tomb.Dying() 107 } 108 109 func (task *provisionerTask) Err() error { 110 return task.tomb.Err() 111 } 112 113 func (task *provisionerTask) loop() error { 114 logger.Infof("Starting up provisioner task %s", task.machineTag) 115 defer watcher.Stop(task.machineWatcher, &task.tomb) 116 117 // Don't allow the safe mode to change until we have 118 // read at least one set of changes, which will populate 119 // the task.machines map. Otherwise we will potentially 120 // see all legitimate instances as unknown. 121 var safeModeChan chan bool 122 123 // When the watcher is started, it will have the initial changes be all 124 // the machines that are relevant. Also, since this is available straight 125 // away, we know there will be some changes right off the bat. 126 for { 127 select { 128 case <-task.tomb.Dying(): 129 logger.Infof("Shutting down provisioner task %s", task.machineTag) 130 return tomb.ErrDying 131 case ids, ok := <-task.machineWatcher.Changes(): 132 if !ok { 133 return watcher.MustErr(task.machineWatcher) 134 } 135 // TODO(dfc; lp:1042717) fire process machines periodically to shut down unknown 136 // instances. 137 if err := task.processMachines(ids); err != nil { 138 return fmt.Errorf("failed to process updated machines: %v", err) 139 } 140 // We've seen a set of changes. Enable safe mode change. 141 safeModeChan = task.safeModeChan 142 case safeMode := <-safeModeChan: 143 if safeMode == task.safeMode { 144 break 145 } 146 logger.Infof("safe mode changed to %v", safeMode) 147 task.safeMode = safeMode 148 if !safeMode { 149 // Safe mode has been disabled, so process current machines 150 // so that unknown machines will be immediately dealt with. 151 if err := task.processMachines(nil); err != nil { 152 return fmt.Errorf("failed to process machines after safe mode disabled: %v", err) 153 } 154 } 155 } 156 } 157 } 158 159 // SetSafeMode implements ProvisionerTask.SetSafeMode(). 160 func (task *provisionerTask) SetSafeMode(safeMode bool) { 161 select { 162 case task.safeModeChan <- safeMode: 163 case <-task.Dying(): 164 } 165 } 166 167 func (task *provisionerTask) processMachines(ids []string) error { 168 logger.Tracef("processMachines(%v)", ids) 169 // Populate the tasks maps of current instances and machines. 170 err := task.populateMachineMaps(ids) 171 if err != nil { 172 return err 173 } 174 175 // Find machines without an instance id or that are dead 176 pending, dead, err := task.pendingOrDead(ids) 177 if err != nil { 178 return err 179 } 180 181 // Stop all machines that are dead 182 stopping := task.instancesForMachines(dead) 183 184 // Find running instances that have no machines associated 185 unknown, err := task.findUnknownInstances(stopping) 186 if err != nil { 187 return err 188 } 189 if task.safeMode { 190 logger.Infof("running in safe mode, unknown instances not stopped %v", instanceIds(unknown)) 191 unknown = nil 192 } 193 if len(stopping) > 0 { 194 logger.Infof("stopping known instances %v", stopping) 195 } 196 if len(unknown) > 0 { 197 logger.Infof("stopping unknown instances %v", instanceIds(unknown)) 198 } 199 // It's important that we stop unknown instances before starting 200 // pending ones, because if we start an instance and then fail to 201 // set its InstanceId on the machine we don't want to start a new 202 // instance for the same machine ID. 203 if err := task.stopInstances(append(stopping, unknown...)); err != nil { 204 return err 205 } 206 207 // Remove any dead machines from state. 208 for _, machine := range dead { 209 logger.Infof("removing dead machine %q", machine) 210 if err := machine.Remove(); err != nil { 211 logger.Errorf("failed to remove dead machine %q", machine) 212 } 213 delete(task.machines, machine.Id()) 214 } 215 216 // Start an instance for the pending ones 217 return task.startMachines(pending) 218 } 219 220 func instanceIds(instances []instance.Instance) []string { 221 ids := make([]string, 0, len(instances)) 222 for _, inst := range instances { 223 ids = append(ids, string(inst.Id())) 224 } 225 return ids 226 } 227 228 func (task *provisionerTask) populateMachineMaps(ids []string) error { 229 task.instances = make(map[instance.Id]instance.Instance) 230 231 instances, err := task.broker.AllInstances() 232 if err != nil { 233 logger.Errorf("failed to get all instances from broker: %v", err) 234 return err 235 } 236 for _, i := range instances { 237 task.instances[i.Id()] = i 238 } 239 240 // Update the machines map with new data for each of the machines in the 241 // change list. 242 // TODO(thumper): update for API server later to get all machines in one go. 243 for _, id := range ids { 244 machineTag := names.MachineTag(id) 245 machine, err := task.machineGetter.Machine(machineTag) 246 switch { 247 case params.IsCodeNotFoundOrCodeUnauthorized(err): 248 logger.Debugf("machine %q not found in state", id) 249 delete(task.machines, id) 250 case err == nil: 251 task.machines[id] = machine 252 default: 253 logger.Errorf("failed to get machine: %v", err) 254 } 255 } 256 return nil 257 } 258 259 // pendingOrDead looks up machines with ids and returns those that do not 260 // have an instance id assigned yet, and also those that are dead. 261 func (task *provisionerTask) pendingOrDead(ids []string) (pending, dead []*apiprovisioner.Machine, err error) { 262 for _, id := range ids { 263 machine, found := task.machines[id] 264 if !found { 265 logger.Infof("machine %q not found", id) 266 continue 267 } 268 switch machine.Life() { 269 case params.Dying: 270 if _, err := machine.InstanceId(); err == nil { 271 continue 272 } else if !params.IsCodeNotProvisioned(err) { 273 logger.Errorf("failed to load machine %q instance id: %v", machine, err) 274 return nil, nil, err 275 } 276 logger.Infof("killing dying, unprovisioned machine %q", machine) 277 if err := machine.EnsureDead(); err != nil { 278 logger.Errorf("failed to ensure machine dead %q: %v", machine, err) 279 return nil, nil, err 280 } 281 fallthrough 282 case params.Dead: 283 dead = append(dead, machine) 284 continue 285 } 286 if instId, err := machine.InstanceId(); err != nil { 287 if !params.IsCodeNotProvisioned(err) { 288 logger.Errorf("failed to load machine %q instance id: %v", machine, err) 289 continue 290 } 291 status, _, err := machine.Status() 292 if err != nil { 293 logger.Infof("cannot get machine %q status: %v", machine, err) 294 continue 295 } 296 if status == params.StatusPending { 297 pending = append(pending, machine) 298 logger.Infof("found machine %q pending provisioning", machine) 299 continue 300 } 301 } else { 302 logger.Infof("machine %v already started as instance %q", machine, instId) 303 } 304 } 305 logger.Tracef("pending machines: %v", pending) 306 logger.Tracef("dead machines: %v", dead) 307 return 308 } 309 310 // findUnknownInstances finds instances which are not associated with a machine. 311 func (task *provisionerTask) findUnknownInstances(stopping []instance.Instance) ([]instance.Instance, error) { 312 // Make a copy of the instances we know about. 313 instances := make(map[instance.Id]instance.Instance) 314 for k, v := range task.instances { 315 instances[k] = v 316 } 317 318 for _, m := range task.machines { 319 instId, err := m.InstanceId() 320 switch { 321 case err == nil: 322 delete(instances, instId) 323 case params.IsCodeNotProvisioned(err): 324 case params.IsCodeNotFoundOrCodeUnauthorized(err): 325 default: 326 return nil, err 327 } 328 } 329 // Now remove all those instances that we are stopping already as we 330 // know about those and don't want to include them in the unknown list. 331 for _, inst := range stopping { 332 delete(instances, inst.Id()) 333 } 334 var unknown []instance.Instance 335 for _, inst := range instances { 336 unknown = append(unknown, inst) 337 } 338 return unknown, nil 339 } 340 341 // instancesForMachines returns a list of instance.Instance that represent 342 // the list of machines running in the provider. Missing machines are 343 // omitted from the list. 344 func (task *provisionerTask) instancesForMachines(machines []*apiprovisioner.Machine) []instance.Instance { 345 var instances []instance.Instance 346 for _, machine := range machines { 347 instId, err := machine.InstanceId() 348 if err == nil { 349 instance, found := task.instances[instId] 350 // If the instance is not found we can't stop it. 351 if found { 352 instances = append(instances, instance) 353 } 354 } 355 } 356 return instances 357 } 358 359 func (task *provisionerTask) stopInstances(instances []instance.Instance) error { 360 // Although calling StopInstance with an empty slice should produce no change in the 361 // provider, environs like dummy do not consider this a noop. 362 if len(instances) == 0 { 363 return nil 364 } 365 if err := task.broker.StopInstances(instances); err != nil { 366 logger.Errorf("broker failed to stop instances: %v", err) 367 return err 368 } 369 return nil 370 } 371 372 func (task *provisionerTask) startMachines(machines []*apiprovisioner.Machine) error { 373 for _, m := range machines { 374 if err := task.startMachine(m); err != nil { 375 return fmt.Errorf("cannot start machine %v: %v", m, err) 376 } 377 } 378 return nil 379 } 380 381 func (task *provisionerTask) startMachine(machine *apiprovisioner.Machine) error { 382 cons, err := machine.Constraints() 383 if err != nil { 384 return err 385 } 386 series, err := machine.Series() 387 if err != nil { 388 return err 389 } 390 possibleTools, err := task.possibleTools(series, cons) 391 if err != nil { 392 return err 393 } 394 machineConfig, err := task.machineConfig(machine) 395 if err != nil { 396 return err 397 } 398 inst, metadata, err := task.broker.StartInstance(cons, possibleTools, machineConfig) 399 if err != nil { 400 // Set the state to error, so the machine will be skipped next 401 // time until the error is resolved, but don't return an 402 // error; just keep going with the other machines. 403 logger.Errorf("cannot start instance for machine %q: %v", machine, err) 404 if err1 := machine.SetStatus(params.StatusError, err.Error()); err1 != nil { 405 // Something is wrong with this machine, better report it back. 406 logger.Errorf("cannot set error status for machine %q: %v", machine, err1) 407 return err1 408 } 409 return nil 410 } 411 nonce := machineConfig.MachineNonce 412 if err := machine.SetProvisioned(inst.Id(), nonce, metadata); err != nil { 413 logger.Errorf("cannot register instance for machine %v: %v", machine, err) 414 // The machine is started, but we can't record the mapping in 415 // state. It'll keep running while we fail out and restart, 416 // but will then be detected by findUnknownInstances and 417 // killed again. 418 // 419 // TODO(dimitern) Stop the instance right away here. 420 // 421 // Multiple instantiations of a given machine (with the same 422 // machine ID) cannot coexist, because findUnknownInstances is 423 // called before startMachines. However, if the first machine 424 // had started to do work before being replaced, we may 425 // encounter surprising problems. 426 return err 427 } 428 logger.Infof("started machine %s as instance %s with hardware %q", machine, inst.Id(), metadata) 429 return nil 430 } 431 432 func (task *provisionerTask) possibleTools(series string, cons constraints.Value) (coretools.List, error) { 433 if env, ok := task.broker.(environs.Environ); ok { 434 agentVersion, ok := env.Config().AgentVersion() 435 if !ok { 436 return nil, fmt.Errorf("no agent version set in environment configuration") 437 } 438 return tools.FindInstanceTools(env, agentVersion, series, cons.Arch) 439 } 440 if hasTools, ok := task.broker.(coretools.HasTools); ok { 441 return hasTools.Tools(), nil 442 } 443 panic(fmt.Errorf("broker of type %T does not provide any tools", task.broker)) 444 } 445 446 func (task *provisionerTask) machineConfig(machine *apiprovisioner.Machine) (*cloudinit.MachineConfig, error) { 447 stateInfo, apiInfo, err := task.auth.SetupAuthentication(machine) 448 if err != nil { 449 logger.Errorf("failed to setup authentication: %v", err) 450 return nil, err 451 } 452 // Generated a nonce for the new instance, with the format: "machine-#:UUID". 453 // The first part is a badge, specifying the tag of the machine the provisioner 454 // is running on, while the second part is a random UUID. 455 uuid, err := utils.NewUUID() 456 if err != nil { 457 return nil, err 458 } 459 nonce := fmt.Sprintf("%s:%s", task.machineTag, uuid.String()) 460 serie, err := machine.Series() 461 if err != nil { 462 return nil, err 463 } 464 machineConfig := environs.NewMachineConfig(machine.Id(), nonce, serie, stateInfo, apiInfo) 465 return machineConfig, nil 466 }