github.com/mattyw/juju@v0.0.0-20140610034352-732aecd63861/worker/provisioner/provisioner_task.go

// Copyright 2012, 2013 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package provisioner

import (
	"fmt"
	"time"

	"github.com/juju/names"
	"github.com/juju/utils"
	"github.com/juju/utils/set"
	"launchpad.net/tomb"

	"github.com/juju/juju/constraints"
	"github.com/juju/juju/environs"
	"github.com/juju/juju/environs/cloudinit"
	"github.com/juju/juju/environs/network"
	"github.com/juju/juju/environs/tools"
	"github.com/juju/juju/instance"
	"github.com/juju/juju/state/api/params"
	apiprovisioner "github.com/juju/juju/state/api/provisioner"
	apiwatcher "github.com/juju/juju/state/api/watcher"
	"github.com/juju/juju/state/watcher"
	coretools "github.com/juju/juju/tools"
	"github.com/juju/juju/worker"
)

type ProvisionerTask interface {
	worker.Worker
	Stop() error
	Dying() <-chan struct{}
	Err() error

	// SetSafeMode sets a flag to indicate whether the provisioner task
	// runs in safe mode or not. In safe mode, any running instances
	// which do not exist in state are allowed to keep running rather than
	// being shut down.
	SetSafeMode(safeMode bool)
}

type MachineGetter interface {
	Machine(tag string) (*apiprovisioner.Machine, error)
	MachinesWithTransientErrors() ([]*apiprovisioner.Machine, []params.StatusResult, error)
}

var _ MachineGetter = (*apiprovisioner.State)(nil)

func NewProvisionerTask(
	machineTag string,
	safeMode bool,
	machineGetter MachineGetter,
	machineWatcher apiwatcher.StringsWatcher,
	retryWatcher apiwatcher.NotifyWatcher,
	broker environs.InstanceBroker,
	auth environs.AuthenticationProvider,
) ProvisionerTask {
	task := &provisionerTask{
		machineTag:     machineTag,
		machineGetter:  machineGetter,
		machineWatcher: machineWatcher,
		retryWatcher:   retryWatcher,
		broker:         broker,
		auth:           auth,
		safeMode:       safeMode,
		safeModeChan:   make(chan bool, 1),
		machines:       make(map[string]*apiprovisioner.Machine),
	}
	go func() {
		defer task.tomb.Done()
		task.tomb.Kill(task.loop())
	}()
	return task
}

type provisionerTask struct {
	machineTag     string
	machineGetter  MachineGetter
	machineWatcher apiwatcher.StringsWatcher
	retryWatcher   apiwatcher.NotifyWatcher
	broker         environs.InstanceBroker
	tomb           tomb.Tomb
	auth           environs.AuthenticationProvider

	safeMode     bool
	safeModeChan chan bool

	// instance id -> instance
	instances map[instance.Id]instance.Instance
	// machine id -> machine
	machines map[string]*apiprovisioner.Machine
}

// Kill implements worker.Worker.Kill.
func (task *provisionerTask) Kill() {
	task.tomb.Kill(nil)
}

// Wait implements worker.Worker.Wait.
func (task *provisionerTask) Wait() error {
	return task.tomb.Wait()
}

func (task *provisionerTask) Stop() error {
	task.Kill()
	return task.Wait()
}

func (task *provisionerTask) Dying() <-chan struct{} {
	return task.tomb.Dying()
}

func (task *provisionerTask) Err() error {
	return task.tomb.Err()
}

func (task *provisionerTask) loop() error {
	logger.Infof("Starting up provisioner task %s", task.machineTag)
	defer watcher.Stop(task.machineWatcher, &task.tomb)

	// Don't allow the safe mode to change until we have
	// read at least one set of changes, which will populate
	// the task.machines map.
	// Otherwise we will potentially
	// see all legitimate instances as unknown.
	var safeModeChan chan bool

	// Not all provisioners have a retry channel.
	var retryChan <-chan struct{}
	if task.retryWatcher != nil {
		retryChan = task.retryWatcher.Changes()
	}

	// When the watcher is started, its initial event contains all the
	// machines that are relevant. Since that event is available straight
	// away, we know there will be some changes right off the bat.
	for {
		select {
		case <-task.tomb.Dying():
			logger.Infof("Shutting down provisioner task %s", task.machineTag)
			return tomb.ErrDying
		case ids, ok := <-task.machineWatcher.Changes():
			if !ok {
				return watcher.MustErr(task.machineWatcher)
			}
			if err := task.processMachines(ids); err != nil {
				return fmt.Errorf("failed to process updated machines: %v", err)
			}
			// We've seen a set of changes. Enable safe mode change.
			safeModeChan = task.safeModeChan
		case safeMode := <-safeModeChan:
			if safeMode == task.safeMode {
				break
			}
			logger.Infof("safe mode changed to %v", safeMode)
			task.safeMode = safeMode
			if !safeMode {
				// Safe mode has been disabled, so process the current machines
				// so that unknown machines will be immediately dealt with.
				if err := task.processMachines(nil); err != nil {
					return fmt.Errorf("failed to process machines after safe mode disabled: %v", err)
				}
			}
		case <-retryChan:
			if err := task.processMachinesWithTransientErrors(); err != nil {
				return fmt.Errorf("failed to process machines with transient errors: %v", err)
			}
		}
	}
}

// SetSafeMode implements ProvisionerTask.SetSafeMode().
func (task *provisionerTask) SetSafeMode(safeMode bool) {
	select {
	case task.safeModeChan <- safeMode:
	case <-task.Dying():
	}
}

func (task *provisionerTask) processMachinesWithTransientErrors() error {
	machines, statusResults, err := task.machineGetter.MachinesWithTransientErrors()
	if err != nil {
		return nil
	}
	logger.Tracef("processMachinesWithTransientErrors(%v)", statusResults)
	var pending []*apiprovisioner.Machine
	for i, status := range statusResults {
		if status.Error != nil {
			logger.Errorf("cannot retry provisioning of machine %q: %v", status.Id, status.Error)
			continue
		}
		machine := machines[i]
		if err := machine.SetStatus(params.StatusPending, "", nil); err != nil {
			logger.Errorf("cannot reset status of machine %q: %v", status.Id, err)
			continue
		}
		task.machines[machine.Tag()] = machine
		pending = append(pending, machine)
	}
	return task.startMachines(pending)
}

func (task *provisionerTask) processMachines(ids []string) error {
	logger.Tracef("processMachines(%v)", ids)
	// Populate the task's maps of current instances and machines.
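	// The instance map is rebuilt from the broker on every pass, while the
	// machine map is only updated for the ids in the current change set.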
	err := task.populateMachineMaps(ids)
	if err != nil {
		return err
	}

	// Find machines without an instance id or that are dead.
	pending, dead, err := task.pendingOrDead(ids)
	if err != nil {
		return err
	}

	// Stop all machines that are dead.
	stopping := task.instancesForMachines(dead)

	// Find running instances that have no machines associated.
	unknown, err := task.findUnknownInstances(stopping)
	if err != nil {
		return err
	}
	if task.safeMode {
		logger.Infof("running in safe mode, unknown instances not stopped %v", instanceIds(unknown))
		unknown = nil
	}
	if len(stopping) > 0 {
		logger.Infof("stopping known instances %v", stopping)
	}
	if len(unknown) > 0 {
		logger.Infof("stopping unknown instances %v", instanceIds(unknown))
	}
	// It's important that we stop unknown instances before starting
	// pending ones, because if we start an instance and then fail to
	// set its InstanceId on the machine we don't want to start a new
	// instance for the same machine ID.
	if err := task.stopInstances(append(stopping, unknown...)); err != nil {
		return err
	}

	// Remove any dead machines from state.
	for _, machine := range dead {
		logger.Infof("removing dead machine %q", machine)
		if err := machine.Remove(); err != nil {
			logger.Errorf("failed to remove dead machine %q", machine)
		}
		delete(task.machines, machine.Id())
	}

	// Start an instance for the pending ones.
	return task.startMachines(pending)
}

func instanceIds(instances []instance.Instance) []string {
	ids := make([]string, 0, len(instances))
	for _, inst := range instances {
		ids = append(ids, string(inst.Id()))
	}
	return ids
}

func (task *provisionerTask) populateMachineMaps(ids []string) error {
	task.instances = make(map[instance.Id]instance.Instance)

	instances, err := task.broker.AllInstances()
	if err != nil {
		logger.Errorf("failed to get all instances from broker: %v", err)
		return err
	}
	for _, i := range instances {
		task.instances[i.Id()] = i
	}

	// Update the machines map with new data for each of the machines in the
	// change list.
	// TODO(thumper): update for API server later to get all machines in one go.
	for _, id := range ids {
		machineTag := names.MachineTag(id)
		machine, err := task.machineGetter.Machine(machineTag)
		switch {
		case params.IsCodeNotFoundOrCodeUnauthorized(err):
			logger.Debugf("machine %q not found in state", id)
			delete(task.machines, id)
		case err == nil:
			task.machines[id] = machine
		default:
			logger.Errorf("failed to get machine: %v", err)
		}
	}
	return nil
}

// pendingOrDead looks up machines with ids and returns those that do not
// have an instance id assigned yet, and also those that are dead.
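// A dying machine that still has an instance id assigned is skipped; dying
// machines that were never provisioned are forced dead here and reported as dead.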
func (task *provisionerTask) pendingOrDead(ids []string) (pending, dead []*apiprovisioner.Machine, err error) {
	for _, id := range ids {
		machine, found := task.machines[id]
		if !found {
			logger.Infof("machine %q not found", id)
			continue
		}
		switch machine.Life() {
		case params.Dying:
			if _, err := machine.InstanceId(); err == nil {
				continue
			} else if !params.IsCodeNotProvisioned(err) {
				logger.Errorf("failed to load machine %q instance id: %v", machine, err)
				return nil, nil, err
			}
			logger.Infof("killing dying, unprovisioned machine %q", machine)
			if err := machine.EnsureDead(); err != nil {
				logger.Errorf("failed to ensure machine dead %q: %v", machine, err)
				return nil, nil, err
			}
			fallthrough
		case params.Dead:
			dead = append(dead, machine)
			continue
		}
		if instId, err := machine.InstanceId(); err != nil {
			if !params.IsCodeNotProvisioned(err) {
				logger.Errorf("failed to load machine %q instance id: %v", machine, err)
				continue
			}
			status, _, err := machine.Status()
			if err != nil {
				logger.Infof("cannot get machine %q status: %v", machine, err)
				continue
			}
			if status == params.StatusPending {
				pending = append(pending, machine)
				logger.Infof("found machine %q pending provisioning", machine)
				continue
			}
		} else {
			logger.Infof("machine %v already started as instance %q", machine, instId)
		}
	}
	logger.Tracef("pending machines: %v", pending)
	logger.Tracef("dead machines: %v", dead)
	return
}

// findUnknownInstances finds instances which are not associated with a machine.
func (task *provisionerTask) findUnknownInstances(stopping []instance.Instance) ([]instance.Instance, error) {
	// Make a copy of the instances we know about.
	instances := make(map[instance.Id]instance.Instance)
	for k, v := range task.instances {
		instances[k] = v
	}

	for _, m := range task.machines {
		instId, err := m.InstanceId()
		switch {
		case err == nil:
			delete(instances, instId)
		case params.IsCodeNotProvisioned(err):
		case params.IsCodeNotFoundOrCodeUnauthorized(err):
		default:
			return nil, err
		}
	}
	// Now remove all those instances that we are stopping already, as we
	// know about those and don't want to include them in the unknown list.
	for _, inst := range stopping {
		delete(instances, inst.Id())
	}
	var unknown []instance.Instance
	for _, inst := range instances {
		unknown = append(unknown, inst)
	}
	return unknown, nil
}

// instancesForMachines returns a list of instance.Instance that represent
// the list of machines running in the provider. Missing machines are
// omitted from the list.
func (task *provisionerTask) instancesForMachines(machines []*apiprovisioner.Machine) []instance.Instance {
	var instances []instance.Instance
	for _, machine := range machines {
		instId, err := machine.InstanceId()
		if err == nil {
			instance, found := task.instances[instId]
			// If the instance is not found we can't stop it.
			if found {
				instances = append(instances, instance)
			}
		}
	}
	return instances
}

func (task *provisionerTask) stopInstances(instances []instance.Instance) error {
	// Although calling StopInstances with an empty slice should produce no change
	// in the provider, environs like dummy do not consider this a no-op.
	if len(instances) == 0 {
		return nil
	}
	ids := make([]instance.Id, len(instances))
	for i, inst := range instances {
		ids[i] = inst.Id()
	}
	if err := task.broker.StopInstances(ids...); err != nil {
		logger.Errorf("broker failed to stop instances: %v", err)
		return err
	}
	return nil
}

func (task *provisionerTask) startMachines(machines []*apiprovisioner.Machine) error {
	for _, m := range machines {
		if err := task.startMachine(m); err != nil {
			return fmt.Errorf("cannot start machine %v: %v", m, err)
		}
	}
	return nil
}

func (task *provisionerTask) setErrorStatus(message string, machine *apiprovisioner.Machine, err error) error {
	logger.Errorf(message, machine, err)
	if err1 := machine.SetStatus(params.StatusError, err.Error(), nil); err1 != nil {
		// Something is wrong with this machine, better report it back.
		logger.Errorf("cannot set error status for machine %q: %v", machine, err1)
		return err1
	}
	return nil
}

func (task *provisionerTask) prepareNetworkAndInterfaces(networkInfo []network.Info) (
	networks []params.Network, ifaces []params.NetworkInterface) {
	if len(networkInfo) == 0 {
		return nil, nil
	}
	visitedNetworks := set.NewStrings()
	for _, info := range networkInfo {
		networkTag := names.NetworkTag(info.NetworkName)
		if !visitedNetworks.Contains(networkTag) {
			networks = append(networks, params.Network{
				Tag:        networkTag,
				ProviderId: info.ProviderId,
				CIDR:       info.CIDR,
				VLANTag:    info.VLANTag,
			})
			visitedNetworks.Add(networkTag)
		}
		ifaces = append(ifaces, params.NetworkInterface{
			InterfaceName: info.InterfaceName,
			MACAddress:    info.MACAddress,
			NetworkTag:    networkTag,
			IsVirtual:     info.IsVirtual,
		})
	}
	return networks, ifaces
}

func (task *provisionerTask) startMachine(machine *apiprovisioner.Machine) error {
	provisioningInfo, err := task.provisioningInfo(machine)
	if err != nil {
		return err
	}
	possibleTools, err := task.possibleTools(provisioningInfo.Series, provisioningInfo.Constraints)
	if err != nil {
		return task.setErrorStatus("cannot find tools for machine %q: %v", machine, err)
	}
	inst, metadata, networkInfo, err := task.broker.StartInstance(environs.StartInstanceParams{
		Constraints:       provisioningInfo.Constraints,
		Tools:             possibleTools,
		MachineConfig:     provisioningInfo.MachineConfig,
		Placement:         provisioningInfo.Placement,
		DistributionGroup: machine.DistributionGroup,
	})
	if err != nil {
		// Set the state to error, so the machine will be skipped next
		// time until the error is resolved, but don't return an
		// error; just keep going with the other machines.
		return task.setErrorStatus("cannot start instance for machine %q: %v", machine, err)
	}
	nonce := provisioningInfo.MachineConfig.MachineNonce
	networks, ifaces := task.prepareNetworkAndInterfaces(networkInfo)

	err = machine.SetInstanceInfo(inst.Id(), nonce, metadata, networks, ifaces)
	if err != nil && params.IsCodeNotImplemented(err) {
		return fmt.Errorf("cannot provision instance %v for machine %q with networks: not implemented", inst.Id(), machine)
	} else if err == nil {
		logger.Infof("started machine %s as instance %s with hardware %q, networks %v, interfaces %v", machine, inst.Id(), metadata, networks, ifaces)
		return nil
	}
	// We need to stop the instance right away here, set error status and go on.
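	// The error from setErrorStatus is not checked; the instance is stopped
	// regardless and, if the stop succeeds, provisioning carries on with the
	// remaining machines.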
	task.setErrorStatus("cannot register instance for machine %v: %v", machine, err)
	if err := task.broker.StopInstances(inst.Id()); err != nil {
		// We cannot even stop the instance, log the error and quit.
		logger.Errorf("cannot stop instance %q for machine %v: %v", inst.Id(), machine, err)
		return err
	}
	return nil
}

func (task *provisionerTask) possibleTools(series string, cons constraints.Value) (coretools.List, error) {
	if env, ok := task.broker.(environs.Environ); ok {
		agentVersion, ok := env.Config().AgentVersion()
		if !ok {
			return nil, fmt.Errorf("no agent version set in environment configuration")
		}
		return tools.FindInstanceTools(env, agentVersion, series, cons.Arch)
	}
	if hasTools, ok := task.broker.(coretools.HasTools); ok {
		return hasTools.Tools(series), nil
	}
	panic(fmt.Errorf("broker of type %T does not provide any tools", task.broker))
}

type provisioningInfo struct {
	Constraints   constraints.Value
	Series        string
	Placement     string
	MachineConfig *cloudinit.MachineConfig
}

func (task *provisionerTask) provisioningInfo(machine *apiprovisioner.Machine) (*provisioningInfo, error) {
	stateInfo, apiInfo, err := task.auth.SetupAuthentication(machine)
	if err != nil {
		logger.Errorf("failed to setup authentication: %v", err)
		return nil, err
	}
	// Generate a nonce for the new instance, with the format: "machine-#:UUID".
	// The first part is a badge, specifying the tag of the machine the provisioner
	// is running on, while the second part is a random UUID.
	uuid, err := utils.NewUUID()
	if err != nil {
		return nil, err
	}
	// ProvisioningInfo is new in 1.20; wait for the API server to be upgraded
	// so we don't spew errors on upgrade.
	var pInfo *params.ProvisioningInfo
	for {
		if pInfo, err = machine.ProvisioningInfo(); err == nil {
			break
		}
		if params.IsCodeNotImplemented(err) {
			logger.Infof("waiting for state server to be upgraded")
			select {
			case <-task.tomb.Dying():
				return nil, tomb.ErrDying
			case <-time.After(15 * time.Second):
				continue
			}
		}
		return nil, err
	}
	nonce := fmt.Sprintf("%s:%s", task.machineTag, uuid.String())
	machineConfig := environs.NewMachineConfig(machine.Id(), nonce, pInfo.Networks, stateInfo, apiInfo)
	return &provisioningInfo{
		Constraints:   pInfo.Constraints,
		Series:        pInfo.Series,
		Placement:     pInfo.Placement,
		MachineConfig: machineConfig,
	}, nil
}