github.com/altoros/juju-vmware@v0.0.0-20150312064031-f19ae857ccca/worker/provisioner/provisioner_task.go (about) 1 // Copyright 2012, 2013 Canonical Ltd. 2 // Licensed under the AGPLv3, see LICENCE file for details. 3 4 package provisioner 5 6 import ( 7 "fmt" 8 "time" 9 10 "github.com/juju/errors" 11 "github.com/juju/names" 12 "github.com/juju/utils" 13 "github.com/juju/utils/set" 14 "launchpad.net/tomb" 15 16 apiprovisioner "github.com/juju/juju/api/provisioner" 17 apiwatcher "github.com/juju/juju/api/watcher" 18 "github.com/juju/juju/apiserver/params" 19 "github.com/juju/juju/constraints" 20 "github.com/juju/juju/environmentserver/authentication" 21 "github.com/juju/juju/environs" 22 "github.com/juju/juju/environs/cloudinit" 23 "github.com/juju/juju/environs/config" 24 "github.com/juju/juju/instance" 25 "github.com/juju/juju/network" 26 "github.com/juju/juju/state/watcher" 27 coretools "github.com/juju/juju/tools" 28 "github.com/juju/juju/version" 29 "github.com/juju/juju/worker" 30 ) 31 32 type ProvisionerTask interface { 33 worker.Worker 34 Stop() error 35 Dying() <-chan struct{} 36 Err() error 37 38 // SetHarvestMode sets a flag to indicate how the provisioner task 39 // should harvest machines. See config.HarvestMode for 40 // documentation of behavior. 41 SetHarvestMode(mode config.HarvestMode) 42 } 43 44 type MachineGetter interface { 45 Machine(names.MachineTag) (*apiprovisioner.Machine, error) 46 MachinesWithTransientErrors() ([]*apiprovisioner.Machine, []params.StatusResult, error) 47 } 48 49 // ToolsFinder is an interface used for finding tools to run on 50 // provisioned instances. 51 type ToolsFinder interface { 52 // FindTools returns a list of tools matching the specified 53 // version and series, and optionally arch. 54 FindTools(version version.Number, series string, arch *string) (coretools.List, error) 55 } 56 57 var _ MachineGetter = (*apiprovisioner.State)(nil) 58 var _ ToolsFinder = (*apiprovisioner.State)(nil) 59 60 func NewProvisionerTask( 61 machineTag names.MachineTag, 62 harvestMode config.HarvestMode, 63 machineGetter MachineGetter, 64 toolsFinder ToolsFinder, 65 machineWatcher apiwatcher.StringsWatcher, 66 retryWatcher apiwatcher.NotifyWatcher, 67 broker environs.InstanceBroker, 68 auth authentication.AuthenticationProvider, 69 imageStream string, 70 secureServerConnection bool, 71 ) ProvisionerTask { 72 task := &provisionerTask{ 73 machineTag: machineTag, 74 machineGetter: machineGetter, 75 toolsFinder: toolsFinder, 76 machineWatcher: machineWatcher, 77 retryWatcher: retryWatcher, 78 broker: broker, 79 auth: auth, 80 harvestMode: harvestMode, 81 harvestModeChan: make(chan config.HarvestMode, 1), 82 machines: make(map[string]*apiprovisioner.Machine), 83 imageStream: imageStream, 84 secureServerConnection: secureServerConnection, 85 } 86 go func() { 87 defer task.tomb.Done() 88 task.tomb.Kill(task.loop()) 89 }() 90 return task 91 } 92 93 type provisionerTask struct { 94 machineTag names.MachineTag 95 machineGetter MachineGetter 96 toolsFinder ToolsFinder 97 machineWatcher apiwatcher.StringsWatcher 98 retryWatcher apiwatcher.NotifyWatcher 99 broker environs.InstanceBroker 100 tomb tomb.Tomb 101 auth authentication.AuthenticationProvider 102 imageStream string 103 secureServerConnection bool 104 harvestMode config.HarvestMode 105 harvestModeChan chan config.HarvestMode 106 // instance id -> instance 107 instances map[instance.Id]instance.Instance 108 // machine id -> machine 109 machines map[string]*apiprovisioner.Machine 110 } 111 112 // Kill implements worker.Worker.Kill. 113 func (task *provisionerTask) Kill() { 114 task.tomb.Kill(nil) 115 } 116 117 // Wait implements worker.Worker.Wait. 118 func (task *provisionerTask) Wait() error { 119 return task.tomb.Wait() 120 } 121 122 func (task *provisionerTask) Stop() error { 123 task.Kill() 124 return task.Wait() 125 } 126 127 func (task *provisionerTask) Dying() <-chan struct{} { 128 return task.tomb.Dying() 129 } 130 131 func (task *provisionerTask) Err() error { 132 return task.tomb.Err() 133 } 134 135 func (task *provisionerTask) loop() error { 136 logger.Infof("Starting up provisioner task %s", task.machineTag) 137 defer watcher.Stop(task.machineWatcher, &task.tomb) 138 139 // Don't allow the harvesting mode to change until we have read at 140 // least one set of changes, which will populate the task.machines 141 // map. Otherwise we will potentially see all legitimate instances 142 // as unknown. 143 var harvestModeChan chan config.HarvestMode 144 145 // Not all provisioners have a retry channel. 146 var retryChan <-chan struct{} 147 if task.retryWatcher != nil { 148 retryChan = task.retryWatcher.Changes() 149 } 150 151 // When the watcher is started, it will have the initial changes be all 152 // the machines that are relevant. Also, since this is available straight 153 // away, we know there will be some changes right off the bat. 154 for { 155 select { 156 case <-task.tomb.Dying(): 157 logger.Infof("Shutting down provisioner task %s", task.machineTag) 158 return tomb.ErrDying 159 case ids, ok := <-task.machineWatcher.Changes(): 160 if !ok { 161 return watcher.EnsureErr(task.machineWatcher) 162 } 163 if err := task.processMachines(ids); err != nil { 164 return errors.Annotate(err, "failed to process updated machines") 165 } 166 // We've seen a set of changes. Enable modification of 167 // harvesting mode. 168 harvestModeChan = task.harvestModeChan 169 case harvestMode := <-harvestModeChan: 170 if harvestMode == task.harvestMode { 171 break 172 } 173 174 logger.Infof("harvesting mode changed to %s", harvestMode) 175 task.harvestMode = harvestMode 176 177 if harvestMode.HarvestUnknown() { 178 179 logger.Infof("harvesting unknown machines") 180 if err := task.processMachines(nil); err != nil { 181 return errors.Annotate(err, "failed to process machines after safe mode disabled") 182 } 183 } 184 case <-retryChan: 185 if err := task.processMachinesWithTransientErrors(); err != nil { 186 return errors.Annotate(err, "failed to process machines with transient errors") 187 } 188 } 189 } 190 } 191 192 // SetHarvestMode implements ProvisionerTask.SetHarvestMode(). 193 func (task *provisionerTask) SetHarvestMode(mode config.HarvestMode) { 194 select { 195 case task.harvestModeChan <- mode: 196 case <-task.Dying(): 197 } 198 } 199 200 func (task *provisionerTask) processMachinesWithTransientErrors() error { 201 machines, statusResults, err := task.machineGetter.MachinesWithTransientErrors() 202 if err != nil { 203 return nil 204 } 205 logger.Tracef("processMachinesWithTransientErrors(%v)", statusResults) 206 var pending []*apiprovisioner.Machine 207 for i, status := range statusResults { 208 if status.Error != nil { 209 logger.Errorf("cannot retry provisioning of machine %q: %v", status.Id, status.Error) 210 continue 211 } 212 machine := machines[i] 213 if err := machine.SetStatus(params.StatusPending, "", nil); err != nil { 214 logger.Errorf("cannot reset status of machine %q: %v", status.Id, err) 215 continue 216 } 217 task.machines[machine.Tag().String()] = machine 218 pending = append(pending, machine) 219 } 220 return task.startMachines(pending) 221 } 222 223 func (task *provisionerTask) processMachines(ids []string) error { 224 logger.Tracef("processMachines(%v)", ids) 225 226 // Populate the tasks maps of current instances and machines. 227 if err := task.populateMachineMaps(ids); err != nil { 228 return err 229 } 230 231 // Find machines without an instance id or that are dead 232 pending, dead, err := task.pendingOrDead(ids) 233 if err != nil { 234 return err 235 } 236 237 // Stop all machines that are dead 238 stopping := task.instancesForMachines(dead) 239 240 // Find running instances that have no machines associated 241 unknown, err := task.findUnknownInstances(stopping) 242 if err != nil { 243 return err 244 } 245 if !task.harvestMode.HarvestUnknown() { 246 logger.Infof( 247 "%s is set to %s; unknown instances not stopped %v", 248 config.ProvisionerHarvestModeKey, 249 task.harvestMode.String(), 250 instanceIds(unknown), 251 ) 252 unknown = nil 253 } 254 if task.harvestMode.HarvestNone() || !task.harvestMode.HarvestDestroyed() { 255 logger.Infof( 256 `%s is set to "%s"; will not harvest %s`, 257 config.ProvisionerHarvestModeKey, 258 task.harvestMode.String(), 259 instanceIds(stopping), 260 ) 261 stopping = nil 262 } 263 264 if len(stopping) > 0 { 265 logger.Infof("stopping known instances %v", stopping) 266 } 267 if len(unknown) > 0 { 268 logger.Infof("stopping unknown instances %v", instanceIds(unknown)) 269 } 270 // It's important that we stop unknown instances before starting 271 // pending ones, because if we start an instance and then fail to 272 // set its InstanceId on the machine we don't want to start a new 273 // instance for the same machine ID. 274 if err := task.stopInstances(append(stopping, unknown...)); err != nil { 275 return err 276 } 277 278 // Remove any dead machines from state. 279 for _, machine := range dead { 280 logger.Infof("removing dead machine %q", machine) 281 if err := machine.Remove(); err != nil { 282 logger.Errorf("failed to remove dead machine %q", machine) 283 } 284 delete(task.machines, machine.Id()) 285 } 286 287 // Start an instance for the pending ones 288 return task.startMachines(pending) 289 } 290 291 func instanceIds(instances []instance.Instance) []string { 292 ids := make([]string, 0, len(instances)) 293 for _, inst := range instances { 294 ids = append(ids, string(inst.Id())) 295 } 296 return ids 297 } 298 299 // populateMachineMaps updates task.instances. Also updates 300 // task.machines map if a list of IDs is given. 301 func (task *provisionerTask) populateMachineMaps(ids []string) error { 302 task.instances = make(map[instance.Id]instance.Instance) 303 304 instances, err := task.broker.AllInstances() 305 if err != nil { 306 return errors.Annotate(err, "failed to get all instances from broker") 307 } 308 for _, i := range instances { 309 task.instances[i.Id()] = i 310 } 311 312 // Update the machines map with new data for each of the machines in the 313 // change list. 314 // TODO(thumper): update for API server later to get all machines in one go. 315 for _, id := range ids { 316 machineTag := names.NewMachineTag(id) 317 machine, err := task.machineGetter.Machine(machineTag) 318 switch { 319 case params.IsCodeNotFoundOrCodeUnauthorized(err): 320 logger.Debugf("machine %q not found in state", id) 321 delete(task.machines, id) 322 case err == nil: 323 task.machines[id] = machine 324 default: 325 return errors.Annotatef(err, "failed to get machine %v", id) 326 } 327 } 328 return nil 329 } 330 331 // pendingOrDead looks up machines with ids and returns those that do not 332 // have an instance id assigned yet, and also those that are dead. 333 func (task *provisionerTask) pendingOrDead(ids []string) (pending, dead []*apiprovisioner.Machine, err error) { 334 for _, id := range ids { 335 machine, found := task.machines[id] 336 if !found { 337 logger.Infof("machine %q not found", id) 338 continue 339 } 340 switch machine.Life() { 341 case params.Dying: 342 if _, err := machine.InstanceId(); err == nil { 343 continue 344 } else if !params.IsCodeNotProvisioned(err) { 345 return nil, nil, errors.Annotatef(err, "failed to load machine %q instance id: %v", machine) 346 } 347 logger.Infof("killing dying, unprovisioned machine %q", machine) 348 if err := machine.EnsureDead(); err != nil { 349 return nil, nil, errors.Annotatef(err, "failed to ensure machine dead %q: %v", machine) 350 } 351 fallthrough 352 case params.Dead: 353 dead = append(dead, machine) 354 continue 355 } 356 if instId, err := machine.InstanceId(); err != nil { 357 if !params.IsCodeNotProvisioned(err) { 358 logger.Errorf("failed to load machine %q instance id: %v", machine, err) 359 continue 360 } 361 status, _, err := machine.Status() 362 if err != nil { 363 logger.Infof("cannot get machine %q status: %v", machine, err) 364 continue 365 } 366 if status == params.StatusPending { 367 pending = append(pending, machine) 368 logger.Infof("found machine %q pending provisioning", machine) 369 continue 370 } 371 } else { 372 logger.Infof("machine %v already started as instance %q", machine, instId) 373 } 374 } 375 logger.Tracef("pending machines: %v", pending) 376 logger.Tracef("dead machines: %v", dead) 377 return 378 } 379 380 // findUnknownInstances finds instances which are not associated with a machine. 381 func (task *provisionerTask) findUnknownInstances(stopping []instance.Instance) ([]instance.Instance, error) { 382 // Make a copy of the instances we know about. 383 instances := make(map[instance.Id]instance.Instance) 384 for k, v := range task.instances { 385 instances[k] = v 386 } 387 388 for _, m := range task.machines { 389 instId, err := m.InstanceId() 390 switch { 391 case err == nil: 392 delete(instances, instId) 393 case params.IsCodeNotProvisioned(err): 394 case params.IsCodeNotFoundOrCodeUnauthorized(err): 395 default: 396 return nil, err 397 } 398 } 399 // Now remove all those instances that we are stopping already as we 400 // know about those and don't want to include them in the unknown list. 401 for _, inst := range stopping { 402 delete(instances, inst.Id()) 403 } 404 var unknown []instance.Instance 405 for _, inst := range instances { 406 unknown = append(unknown, inst) 407 } 408 return unknown, nil 409 } 410 411 // instancesForMachines returns a list of instance.Instance that represent 412 // the list of machines running in the provider. Missing machines are 413 // omitted from the list. 414 func (task *provisionerTask) instancesForMachines(machines []*apiprovisioner.Machine) []instance.Instance { 415 var instances []instance.Instance 416 for _, machine := range machines { 417 instId, err := machine.InstanceId() 418 if err == nil { 419 instance, found := task.instances[instId] 420 // If the instance is not found we can't stop it. 421 if found { 422 instances = append(instances, instance) 423 } 424 } 425 } 426 return instances 427 } 428 429 func (task *provisionerTask) stopInstances(instances []instance.Instance) error { 430 // Although calling StopInstance with an empty slice should produce no change in the 431 // provider, environs like dummy do not consider this a noop. 432 if len(instances) == 0 { 433 return nil 434 } 435 ids := make([]instance.Id, len(instances)) 436 for i, inst := range instances { 437 ids[i] = inst.Id() 438 } 439 if err := task.broker.StopInstances(ids...); err != nil { 440 return errors.Annotate(err, "broker failed to stop instances") 441 } 442 return nil 443 } 444 445 func (task *provisionerTask) constructMachineConfig( 446 machine *apiprovisioner.Machine, 447 auth authentication.AuthenticationProvider, 448 pInfo *params.ProvisioningInfo, 449 ) (*cloudinit.MachineConfig, error) { 450 451 stateInfo, apiInfo, err := auth.SetupAuthentication(machine) 452 if err != nil { 453 return nil, errors.Annotate(err, "failed to setup authentication") 454 } 455 456 // Generated a nonce for the new instance, with the format: "machine-#:UUID". 457 // The first part is a badge, specifying the tag of the machine the provisioner 458 // is running on, while the second part is a random UUID. 459 uuid, err := utils.NewUUID() 460 if err != nil { 461 return nil, errors.Annotate(err, "failed to generate a nonce for machine "+machine.Id()) 462 } 463 464 nonce := fmt.Sprintf("%s:%s", task.machineTag, uuid) 465 return environs.NewMachineConfig( 466 machine.Id(), 467 nonce, 468 task.imageStream, 469 pInfo.Series, 470 task.secureServerConnection, 471 nil, 472 stateInfo, 473 apiInfo, 474 ) 475 } 476 477 func constructStartInstanceParams( 478 machine *apiprovisioner.Machine, 479 machineConfig *cloudinit.MachineConfig, 480 provisioningInfo *params.ProvisioningInfo, 481 possibleTools coretools.List, 482 ) environs.StartInstanceParams { 483 return environs.StartInstanceParams{ 484 Constraints: provisioningInfo.Constraints, 485 Tools: possibleTools, 486 MachineConfig: machineConfig, 487 Placement: provisioningInfo.Placement, 488 DistributionGroup: machine.DistributionGroup, 489 Volumes: provisioningInfo.Volumes, 490 } 491 } 492 493 func (task *provisionerTask) startMachines(machines []*apiprovisioner.Machine) error { 494 for _, m := range machines { 495 496 pInfo, err := task.blockUntilProvisioned(m.ProvisioningInfo) 497 if err != nil { 498 return err 499 } 500 501 machineCfg, err := task.constructMachineConfig(m, task.auth, pInfo) 502 if err != nil { 503 return err 504 } 505 506 assocProvInfoAndMachCfg(pInfo, machineCfg) 507 508 possibleTools, err := task.toolsFinder.FindTools( 509 version.Current.Number, 510 pInfo.Series, 511 pInfo.Constraints.Arch, 512 ) 513 if err != nil { 514 return task.setErrorStatus("cannot find tools for machine %q: %v", m, err) 515 } 516 517 startInstanceParams := constructStartInstanceParams( 518 m, 519 machineCfg, 520 pInfo, 521 possibleTools, 522 ) 523 524 if err := task.startMachine(m, pInfo, startInstanceParams); err != nil { 525 return errors.Annotatef(err, "cannot start machine %v", m) 526 } 527 } 528 return nil 529 } 530 531 func (task *provisionerTask) setErrorStatus(message string, machine *apiprovisioner.Machine, err error) error { 532 logger.Errorf(message, machine, err) 533 if err1 := machine.SetStatus(params.StatusError, err.Error(), nil); err1 != nil { 534 // Something is wrong with this machine, better report it back. 535 return errors.Annotatef(err1, "cannot set error status for machine %q", machine) 536 } 537 return nil 538 } 539 540 func (task *provisionerTask) prepareNetworkAndInterfaces(networkInfo []network.InterfaceInfo) ( 541 networks []params.Network, ifaces []params.NetworkInterface) { 542 if len(networkInfo) == 0 { 543 return nil, nil 544 } 545 visitedNetworks := set.NewStrings() 546 for _, info := range networkInfo { 547 networkTag := names.NewNetworkTag(info.NetworkName).String() 548 if !visitedNetworks.Contains(networkTag) { 549 networks = append(networks, params.Network{ 550 Tag: networkTag, 551 ProviderId: info.ProviderId, 552 CIDR: info.CIDR, 553 VLANTag: info.VLANTag, 554 }) 555 visitedNetworks.Add(networkTag) 556 } 557 ifaces = append(ifaces, params.NetworkInterface{ 558 InterfaceName: info.ActualInterfaceName(), 559 MACAddress: info.MACAddress, 560 NetworkTag: networkTag, 561 IsVirtual: info.IsVirtual(), 562 Disabled: info.Disabled, 563 }) 564 } 565 return networks, ifaces 566 } 567 568 func (task *provisionerTask) startMachine( 569 machine *apiprovisioner.Machine, 570 provisioningInfo *params.ProvisioningInfo, 571 startInstanceParams environs.StartInstanceParams, 572 ) error { 573 574 result, err := task.broker.StartInstance(startInstanceParams) 575 if err != nil { 576 // If this is a retryable error, we retry once 577 if instance.IsRetryableCreationError(errors.Cause(err)) { 578 logger.Infof("retryable error received on start instance - retrying instance creation") 579 result, err = task.broker.StartInstance(startInstanceParams) 580 if err != nil { 581 return task.setErrorStatus("cannot start instance for machine after a retry %q: %v", machine, err) 582 } 583 } else { 584 // Set the state to error, so the machine will be skipped next 585 // time until the error is resolved, but don't return an 586 // error; just keep going with the other machines. 587 return task.setErrorStatus("cannot start instance for machine %q: %v", machine, err) 588 } 589 } 590 591 inst := result.Instance 592 hardware := result.Hardware 593 nonce := startInstanceParams.MachineConfig.MachineNonce 594 networks, ifaces := task.prepareNetworkAndInterfaces(result.NetworkInfo) 595 volumes := result.Volumes 596 597 // TODO(dimitern) In a newer Provisioner API version, change 598 // SetInstanceInfo or add a new method that takes and saves in 599 // state all the information available on a network.InterfaceInfo 600 // for each interface, so we can later manage interfaces 601 // dynamically at run-time. 602 err = machine.SetInstanceInfo(inst.Id(), nonce, hardware, networks, ifaces, volumes) 603 if err != nil && params.IsCodeNotImplemented(err) { 604 return fmt.Errorf("cannot provision instance %v for machine %q with networks: not implemented", inst.Id(), machine) 605 } else if err == nil { 606 logger.Infof("started machine %s as instance %s with hardware %q, networks %v, interfaces %v, volumes %v", machine, inst.Id(), hardware, networks, ifaces, volumes) 607 return nil 608 } 609 // We need to stop the instance right away here, set error status and go on. 610 task.setErrorStatus("cannot register instance for machine %v: %v", machine, err) 611 if err := task.broker.StopInstances(inst.Id()); err != nil { 612 // We cannot even stop the instance, log the error and quit. 613 return errors.Annotatef(err, "cannot stop instance %q for machine %v", inst.Id(), machine) 614 } 615 return nil 616 } 617 618 type provisioningInfo struct { 619 Constraints constraints.Value 620 Series string 621 Placement string 622 MachineConfig *cloudinit.MachineConfig 623 } 624 625 func assocProvInfoAndMachCfg( 626 provInfo *params.ProvisioningInfo, 627 machineConfig *cloudinit.MachineConfig, 628 ) *provisioningInfo { 629 630 machineConfig.Networks = provInfo.Networks 631 632 if len(provInfo.Jobs) > 0 { 633 machineConfig.Jobs = provInfo.Jobs 634 } 635 636 return &provisioningInfo{ 637 Constraints: provInfo.Constraints, 638 Series: provInfo.Series, 639 Placement: provInfo.Placement, 640 MachineConfig: machineConfig, 641 } 642 } 643 644 // ProvisioningInfo is new in 1.20; wait for the API server to be 645 // upgraded so we don't spew errors on upgrade. 646 func (task *provisionerTask) blockUntilProvisioned( 647 provision func() (*params.ProvisioningInfo, error), 648 ) (*params.ProvisioningInfo, error) { 649 650 var pInfo *params.ProvisioningInfo 651 var err error 652 for { 653 if pInfo, err = provision(); err == nil { 654 break 655 } 656 if params.IsCodeNotImplemented(err) { 657 logger.Infof("waiting for state server to be upgraded") 658 select { 659 case <-task.tomb.Dying(): 660 return nil, tomb.ErrDying 661 case <-time.After(15 * time.Second): 662 continue 663 } 664 } 665 return nil, err 666 } 667 668 return pInfo, nil 669 }