github.com/smithx10/nomad@v0.9.1-rc1/client/allocrunner/alloc_runner.go

package allocrunner

import (
	"context"
	"fmt"
	"path/filepath"
	"sync"
	"time"

	log "github.com/hashicorp/go-hclog"
	"github.com/hashicorp/nomad/client/allocdir"
	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
	"github.com/hashicorp/nomad/client/allocrunner/state"
	"github.com/hashicorp/nomad/client/allocrunner/taskrunner"
	"github.com/hashicorp/nomad/client/allocwatcher"
	"github.com/hashicorp/nomad/client/config"
	"github.com/hashicorp/nomad/client/consul"
	"github.com/hashicorp/nomad/client/devicemanager"
	cinterfaces "github.com/hashicorp/nomad/client/interfaces"
	"github.com/hashicorp/nomad/client/pluginmanager/drivermanager"
	cstate "github.com/hashicorp/nomad/client/state"
	cstructs "github.com/hashicorp/nomad/client/structs"
	"github.com/hashicorp/nomad/client/vaultclient"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/plugins/device"
	"github.com/hashicorp/nomad/plugins/drivers"
)

// allocRunner is used to run all the tasks in a given allocation
type allocRunner struct {
	// id is the ID of the allocation. Can be accessed without a lock
	id string

	// logger is the logger for the alloc runner.
	logger log.Logger

	clientConfig *config.Config

	// stateUpdater is used to emit updated alloc state
	stateUpdater cinterfaces.AllocStateHandler

	// taskStateUpdatedCh is ticked whenever task state has changed. Must
	// have len==1 to allow nonblocking notification of state updates while
	// the goroutine is already processing a previous update.
	taskStateUpdatedCh chan struct{}

	// taskStateUpdateHandlerCh is closed when the task state handling
	// goroutine exits. It is unsafe to destroy the local allocation state
	// before this goroutine exits.
	taskStateUpdateHandlerCh chan struct{}

	// allocUpdatedCh is a channel that is used to stream allocation updates into
	// the allocUpdate handler. Must have len==1 to allow nonblocking notification
	// of new allocation updates while the goroutine is processing a previous
	// update.
	allocUpdatedCh chan *structs.Allocation

	// consulClient is the client used by the consul service hook for
	// registering services and checks
	consulClient consul.ConsulServiceAPI

	// vaultClient is used to manage Vault tokens
	vaultClient vaultclient.VaultClient

	// waitCh is closed when the Run loop has exited
	waitCh chan struct{}

	// destroyed is true when the Run loop has exited, postrun hooks have
	// run, and alloc runner has been destroyed. Must acquire destroyedLock
	// to access.
	destroyed bool

	// destroyCh is closed when the Run loop has exited, postrun hooks have
	// run, and alloc runner has been destroyed.
	destroyCh chan struct{}

	// shutdown is true when the Run loop has exited, and shutdown hooks have
	// run. Must acquire destroyedLock to access.
	shutdown bool

	// shutdownCh is closed when the Run loop has exited, and shutdown hooks
	// have run.
	shutdownCh chan struct{}

	// destroyLaunched is true if Destroy has been called. Must acquire
	// destroyedLock to access.
	destroyLaunched bool

	// shutdownLaunched is true if Shutdown has been called. Must acquire
	// destroyedLock to access.
	shutdownLaunched bool

	// destroyedLock guards destroyed, destroyLaunched, shutdownLaunched,
	// and serializes Shutdown/Destroy calls.
	destroyedLock sync.Mutex

	// alloc captures the allocation being run.
	alloc     *structs.Allocation
	allocLock sync.RWMutex

	// state is the alloc runner's state
	state     *state.State
	stateLock sync.RWMutex

	stateDB cstate.StateDB

	// allocDir is used to build the allocation's directory structure.
	allocDir *allocdir.AllocDir

	// runnerHooks are alloc runner lifecycle hooks that should be run on state
	// transitions.
	runnerHooks []interfaces.RunnerHook

	// tasks are the set of task runners
	tasks map[string]*taskrunner.TaskRunner

	// deviceStatsReporter is used to lookup resource usage for alloc devices
	deviceStatsReporter cinterfaces.DeviceStatsReporter

	// allocBroadcaster sends client allocation updates to all listeners
	allocBroadcaster *cstructs.AllocBroadcaster

	// prevAllocWatcher allows waiting for any previous or preempted allocations
	// to exit
	prevAllocWatcher allocwatcher.PrevAllocWatcher

	// prevAllocMigrator allows the migration of a previous allocation's alloc dir.
	prevAllocMigrator allocwatcher.PrevAllocMigrator

	// devicemanager is used to mount devices as well as lookup device
	// statistics
	devicemanager devicemanager.Manager

	// driverManager is responsible for dispensing driver plugins and registering
	// event handlers
	driverManager drivermanager.Manager
}

// NewAllocRunner returns a new allocation runner.
func NewAllocRunner(config *Config) (*allocRunner, error) {
	alloc := config.Alloc
	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
	if tg == nil {
		return nil, fmt.Errorf("failed to lookup task group %q", alloc.TaskGroup)
	}

	ar := &allocRunner{
		id:                       alloc.ID,
		alloc:                    alloc,
		clientConfig:             config.ClientConfig,
		consulClient:             config.Consul,
		vaultClient:              config.Vault,
		tasks:                    make(map[string]*taskrunner.TaskRunner, len(tg.Tasks)),
		waitCh:                   make(chan struct{}),
		destroyCh:                make(chan struct{}),
		shutdownCh:               make(chan struct{}),
		state:                    &state.State{},
		stateDB:                  config.StateDB,
		stateUpdater:             config.StateUpdater,
		taskStateUpdatedCh:       make(chan struct{}, 1),
		taskStateUpdateHandlerCh: make(chan struct{}),
		allocUpdatedCh:           make(chan *structs.Allocation, 1),
		deviceStatsReporter:      config.DeviceStatsReporter,
		prevAllocWatcher:         config.PrevAllocWatcher,
		prevAllocMigrator:        config.PrevAllocMigrator,
		devicemanager:            config.DeviceManager,
		driverManager:            config.DriverManager,
	}

	// Create the logger based on the allocation ID
	ar.logger = config.Logger.Named("alloc_runner").With("alloc_id", alloc.ID)

	// Create alloc broadcaster
	ar.allocBroadcaster = cstructs.NewAllocBroadcaster(ar.logger)

	// Create alloc dir
	ar.allocDir = allocdir.NewAllocDir(ar.logger, filepath.Join(config.ClientConfig.AllocDir, alloc.ID))

	// Initialize the runner hooks.
	ar.initRunnerHooks()

	// Create the TaskRunners
	if err := ar.initTaskRunners(tg.Tasks); err != nil {
		return nil, err
	}

	return ar, nil
}
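// Usage sketch (editorial, not part of the original source): a client that
// owns this runner would typically construct it, restore any persisted task
// state, run it in a goroutine, and later destroy it. The cfg value below is
// a hypothetical, fully populated *Config.
//
//	ar, err := NewAllocRunner(cfg)
//	if err != nil {
//		return err
//	}
//	if err := ar.Restore(); err != nil {
//		return err
//	}
//	go ar.Run()
//
//	// ... later, when the allocation is garbage collected:
//	ar.Destroy()
//	<-ar.DestroyCh()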
// initTaskRunners creates task runners but does *not* run them.
func (ar *allocRunner) initTaskRunners(tasks []*structs.Task) error {
	for _, task := range tasks {
		config := &taskrunner.Config{
			Alloc:               ar.alloc,
			ClientConfig:        ar.clientConfig,
			Task:                task,
			TaskDir:             ar.allocDir.NewTaskDir(task.Name),
			Logger:              ar.logger,
			StateDB:             ar.stateDB,
			StateUpdater:        ar,
			Consul:              ar.consulClient,
			Vault:               ar.vaultClient,
			DeviceStatsReporter: ar.deviceStatsReporter,
			DeviceManager:       ar.devicemanager,
			DriverManager:       ar.driverManager,
		}

		// Create, but do not Run, the task runner
		tr, err := taskrunner.NewTaskRunner(config)
		if err != nil {
			return fmt.Errorf("failed creating runner for task %q: %v", task.Name, err)
		}

		ar.tasks[task.Name] = tr
	}
	return nil
}

func (ar *allocRunner) WaitCh() <-chan struct{} {
	return ar.waitCh
}

// Run the AllocRunner. Starts tasks if the alloc is non-terminal and closes
// WaitCh when it exits. Should be started in a goroutine.
func (ar *allocRunner) Run() {
	// Close the wait channel on return
	defer close(ar.waitCh)

	// Start the task state update handler
	go ar.handleTaskStateUpdates()

	// Start the alloc update handler
	go ar.handleAllocUpdates()

	// If task update chan has been closed, that means we've been shutdown.
	select {
	case <-ar.taskStateUpdateHandlerCh:
		return
	default:
	}

	// Run the prestart hooks if non-terminal
	if ar.shouldRun() {
		if err := ar.prerun(); err != nil {
			ar.logger.Error("prerun failed", "error", err)
			goto POST
		}
	}

	// Run the runners (blocks until they exit)
	ar.runTasks()

POST:
	// Run the postrun hooks
	if err := ar.postrun(); err != nil {
		ar.logger.Error("postrun failed", "error", err)
	}
}

// shouldRun returns true if the alloc is in a state that the alloc runner
// should run it.
func (ar *allocRunner) shouldRun() bool {
	// Do not run allocs that are terminal
	if ar.Alloc().TerminalStatus() {
		ar.logger.Trace("alloc terminal; not running",
			"desired_status", ar.Alloc().DesiredStatus,
			"client_status", ar.Alloc().ClientStatus,
		)
		return false
	}

	// It's possible that the alloc local state was marked terminal before
	// the server copy of the alloc (checked above) was marked as terminal,
	// so check the local state as well.
	switch clientStatus := ar.AllocState().ClientStatus; clientStatus {
	case structs.AllocClientStatusComplete, structs.AllocClientStatusFailed, structs.AllocClientStatusLost:
		ar.logger.Trace("alloc terminal; updating server and not running", "status", clientStatus)
		return false
	}

	return true
}

// runTasks is used to run the task runners and block until they exit.
func (ar *allocRunner) runTasks() {
	for _, task := range ar.tasks {
		go task.Run()
	}

	for _, task := range ar.tasks {
		<-task.WaitCh()
	}
}
// Alloc returns the current allocation being run by this runner as sent by the
// server. This view of the allocation does not have updated task states.
func (ar *allocRunner) Alloc() *structs.Allocation {
	ar.allocLock.RLock()
	defer ar.allocLock.RUnlock()
	return ar.alloc
}

func (ar *allocRunner) setAlloc(updated *structs.Allocation) {
	ar.allocLock.Lock()
	ar.alloc = updated
	ar.allocLock.Unlock()
}

// GetAllocDir returns the alloc dir which is safe for concurrent use.
func (ar *allocRunner) GetAllocDir() *allocdir.AllocDir {
	return ar.allocDir
}

// Restore state from database. Must be called after NewAllocRunner but before
// Run.
func (ar *allocRunner) Restore() error {
	// Retrieve deployment status to avoid resetting it across agent
	// restarts. Once a deployment status is set Nomad no longer monitors
	// alloc health, so we must persist deployment state across restarts.
	ds, err := ar.stateDB.GetDeploymentStatus(ar.id)
	if err != nil {
		return err
	}

	ar.stateLock.Lock()
	ar.state.DeploymentStatus = ds
	ar.stateLock.Unlock()

	// Restore task runners
	for _, tr := range ar.tasks {
		if err := tr.Restore(); err != nil {
			return err
		}
	}

	return nil
}

// persistDeploymentStatus stores AllocDeploymentStatus.
func (ar *allocRunner) persistDeploymentStatus(ds *structs.AllocDeploymentStatus) {
	if err := ar.stateDB.PutDeploymentStatus(ar.id, ds); err != nil {
		// While any persistence errors are very bad, the worst case
		// scenario for failing to persist deployment status is that if
		// the agent is restarted it will monitor the deployment status
		// again. This could cause a deployment's status to change when
		// that shouldn't happen. However, allowing that seems better
		// than failing the entire allocation.
		ar.logger.Error("error storing deployment status", "error", err)
	}
}

// TaskStateUpdated is called by TaskRunner when a task's state has been
// updated. It does not process the update synchronously but instead notifies a
// goroutine that the state has changed. Since processing the state change may
// cause the task to be killed (thus changing its state again) it cannot be done
// synchronously as it would cause a deadlock due to reentrancy.
//
// The goroutine is used to compute changes to the alloc's ClientStatus and to
// update the server with the new state.
func (ar *allocRunner) TaskStateUpdated() {
	select {
	case ar.taskStateUpdatedCh <- struct{}{}:
	default:
		// already pending updates
	}
}

// handleTaskStateUpdates must be run in a goroutine as it monitors
// taskStateUpdatedCh for task state update notifications and processes task
// states.
//
// Processing task state updates must be done in a goroutine as it may have to
// kill tasks which causes further task state updates.
func (ar *allocRunner) handleTaskStateUpdates() {
	defer close(ar.taskStateUpdateHandlerCh)

	for done := false; !done; {
		select {
		case <-ar.taskStateUpdatedCh:
		case <-ar.waitCh:
			// Run has exited, sync once more to ensure final
			// states are collected.
			done = true
		}

		ar.logger.Trace("handling task state update", "done", done)

		// Set with the appropriate event if task runners should be
		// killed.
		var killEvent *structs.TaskEvent

		// If task runners should be killed, this is set to the task
		// name whose fault it is.
		killTask := ""

		// True if task runners should be killed because a leader
		// failed (informational).
		leaderFailed := false

		// Task state has been updated; gather the state of the other tasks
		trNum := len(ar.tasks)
		liveRunners := make([]*taskrunner.TaskRunner, 0, trNum)
		states := make(map[string]*structs.TaskState, trNum)

		for name, tr := range ar.tasks {
			state := tr.TaskState()
			states[name] = state

			// Capture live task runners in case we need to kill them
			if state.State != structs.TaskStateDead {
				liveRunners = append(liveRunners, tr)
				continue
			}

			// Task is dead, determine if other tasks should be killed
			if state.Failed {
				// Only set failed event if no event has been
				// set yet to give dead leaders priority.
				if killEvent == nil {
					killTask = name
					killEvent = structs.NewTaskEvent(structs.TaskSiblingFailed).
						SetFailedSibling(name)
				}
			} else if tr.IsLeader() {
				killEvent = structs.NewTaskEvent(structs.TaskLeaderDead)
				leaderFailed = true
				killTask = name
			}
		}

		// If there's a kill event set and live runners, kill them
		if killEvent != nil && len(liveRunners) > 0 {

			// Log kill reason
			if leaderFailed {
				ar.logger.Debug("leader task dead, destroying all tasks", "leader_task", killTask)
			} else {
				ar.logger.Debug("task failure, destroying all tasks", "failed_task", killTask)
			}

			// Emit kill event for live runners
			for _, tr := range liveRunners {
				tr.EmitEvent(killEvent)
			}

			// Kill 'em all
			states = ar.killTasks()

			// Wait for TaskRunners to exit before continuing to
			// prevent looping before TaskRunners have transitioned
			// to Dead.
			for _, tr := range liveRunners {
				select {
				case <-tr.WaitCh():
				case <-ar.waitCh:
				}
			}
		}

		// Get the client allocation
		calloc := ar.clientAlloc(states)

		// Update the server
		ar.stateUpdater.AllocStateUpdated(calloc)

		// Broadcast client alloc to listeners
		ar.allocBroadcaster.Send(calloc)
	}
}
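// Notification pattern note (editorial sketch, not part of the original
// source): TaskStateUpdated and handleTaskStateUpdates cooperate through the
// len==1 buffered taskStateUpdatedCh. Producers never block and repeated
// notifications coalesce into a single pending signal, so each receive
// handles "everything that changed so far":
//
//	updates := make(chan struct{}, 1)
//
//	// producer side: non-blocking send; drop the signal if one is pending
//	select {
//	case updates <- struct{}{}:
//	default:
//	}
//
//	// consumer side: each wakeup re-reads the state of every task runner
//	for range updates {
//		// recompute alloc state from all task runners
//	}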
// killTasks kills all task runners, leader (if there is one) first. Errors are
// logged except taskrunner.ErrTaskNotRunning which is ignored. Task states
// after Kill has been called are returned.
func (ar *allocRunner) killTasks() map[string]*structs.TaskState {
	var mu sync.Mutex
	states := make(map[string]*structs.TaskState, len(ar.tasks))

	// Kill leader first, synchronously
	for name, tr := range ar.tasks {
		if !tr.IsLeader() {
			continue
		}

		err := tr.Kill(context.TODO(), structs.NewTaskEvent(structs.TaskKilling))
		if err != nil && err != taskrunner.ErrTaskNotRunning {
			ar.logger.Warn("error stopping leader task", "error", err, "task_name", name)
		}

		state := tr.TaskState()
		states[name] = state
		break
	}

	// Kill the rest concurrently
	wg := sync.WaitGroup{}
	for name, tr := range ar.tasks {
		if tr.IsLeader() {
			continue
		}

		wg.Add(1)
		go func(name string, tr *taskrunner.TaskRunner) {
			defer wg.Done()
			err := tr.Kill(context.TODO(), structs.NewTaskEvent(structs.TaskKilling))
			if err != nil && err != taskrunner.ErrTaskNotRunning {
				ar.logger.Warn("error stopping task", "error", err, "task_name", name)
			}

			state := tr.TaskState()
			mu.Lock()
			states[name] = state
			mu.Unlock()
		}(name, tr)
	}
	wg.Wait()

	return states
}

// clientAlloc takes in the task states and returns an Allocation populated
// with Client specific fields
func (ar *allocRunner) clientAlloc(taskStates map[string]*structs.TaskState) *structs.Allocation {
	ar.stateLock.Lock()
	defer ar.stateLock.Unlock()

	// store task states for AllocState to expose
	ar.state.TaskStates = taskStates

	a := &structs.Allocation{
		ID:         ar.id,
		TaskStates: taskStates,
	}

	if d := ar.state.DeploymentStatus; d != nil {
		a.DeploymentStatus = d.Copy()
	}

	// Compute the ClientStatus
	if ar.state.ClientStatus != "" {
		// The client status is being forced
		a.ClientStatus, a.ClientDescription = ar.state.ClientStatus, ar.state.ClientDescription
	} else {
		a.ClientStatus, a.ClientDescription = getClientStatus(taskStates)
	}

	// If the allocation is terminal, make sure all required fields are properly
	// set.
	if a.ClientTerminalStatus() {
		alloc := ar.Alloc()

		// If we are part of a deployment and the task has failed, mark the
		// alloc as unhealthy. This guards against the watcher not being started.
		if a.ClientStatus == structs.AllocClientStatusFailed &&
			alloc.DeploymentID != "" && !a.DeploymentStatus.IsUnhealthy() {
			a.DeploymentStatus = &structs.AllocDeploymentStatus{
				Healthy: helper.BoolToPtr(false),
			}
		}

		// Make sure we have marked FinishedAt for every task. This is used
		// to calculate the reschedule time for failed allocations.
		now := time.Now()
		for _, task := range alloc.Job.LookupTaskGroup(alloc.TaskGroup).Tasks {
			ts, ok := a.TaskStates[task.Name]
			if !ok {
				ts = &structs.TaskState{}
				a.TaskStates[task.Name] = ts
			}
			if ts.FinishedAt.IsZero() {
				ts.FinishedAt = now
			}
		}
	}

	return a
}

// getClientStatus takes in the task states for a given allocation and computes
// the client status and description
func getClientStatus(taskStates map[string]*structs.TaskState) (status, description string) {
	var pending, running, dead, failed bool
	for _, state := range taskStates {
		switch state.State {
		case structs.TaskStateRunning:
			running = true
		case structs.TaskStatePending:
			pending = true
		case structs.TaskStateDead:
			if state.Failed {
				failed = true
			} else {
				dead = true
			}
		}
	}

	// Determine the alloc status
	if failed {
		return structs.AllocClientStatusFailed, "Failed tasks"
	} else if running {
		return structs.AllocClientStatusRunning, "Tasks are running"
	} else if pending {
		return structs.AllocClientStatusPending, "No tasks have started"
	} else if dead {
		return structs.AllocClientStatusComplete, "All tasks have completed"
	}

	return "", ""
}

// SetClientStatus is a helper for forcing a specific client
// status on the alloc runner. This is used during restore errors
// when the task state can't be restored.
func (ar *allocRunner) SetClientStatus(clientStatus string) {
	ar.stateLock.Lock()
	defer ar.stateLock.Unlock()
	ar.state.ClientStatus = clientStatus
}

// AllocState returns a copy of allocation state including a snapshot of task
// states.
func (ar *allocRunner) AllocState() *state.State {
	ar.stateLock.RLock()
	state := ar.state.Copy()
	ar.stateLock.RUnlock()

	// If TaskStateUpdated has not been called yet, ar.state.TaskStates
	// won't be set as it is not the canonical source of TaskStates.
	if len(state.TaskStates) == 0 {
		ar.state.TaskStates = make(map[string]*structs.TaskState, len(ar.tasks))
		for k, tr := range ar.tasks {
			state.TaskStates[k] = tr.TaskState()
		}
	}

	// Generate alloc to get other state fields
	alloc := ar.clientAlloc(state.TaskStates)
	state.ClientStatus = alloc.ClientStatus
	state.ClientDescription = alloc.ClientDescription
	state.DeploymentStatus = alloc.DeploymentStatus

	return state
}

// Update asynchronously updates the running allocation with a new version
// received from the server.
// When processing a new update, we will first attempt to drain stale updates
// from the queue, before appending the new one.
func (ar *allocRunner) Update(update *structs.Allocation) {
	select {
	// Drain queued update from the channel if possible, and check the modify
	// index
	case oldUpdate := <-ar.allocUpdatedCh:
		// If the old update is newer than the replacement, then skip the new one
		// and return. This case shouldn't happen, but may in the case of a bug
		// elsewhere inside the system.
		if oldUpdate.AllocModifyIndex > update.AllocModifyIndex {
			ar.logger.Debug("Discarding allocation update due to newer alloc revision in queue",
				"old_modify_index", oldUpdate.AllocModifyIndex,
				"new_modify_index", update.AllocModifyIndex)
			ar.allocUpdatedCh <- oldUpdate
			return
		} else {
			ar.logger.Debug("Discarding allocation update",
				"skipped_modify_index", oldUpdate.AllocModifyIndex,
				"new_modify_index", update.AllocModifyIndex)
		}
	case <-ar.waitCh:
		ar.logger.Trace("AllocRunner has terminated, skipping alloc update",
			"modify_index", update.AllocModifyIndex)
		return
	default:
	}

	// Queue the new update
	ar.allocUpdatedCh <- update
}

func (ar *allocRunner) handleAllocUpdates() {
	for {
		select {
		case update := <-ar.allocUpdatedCh:
			ar.handleAllocUpdate(update)
		case <-ar.waitCh:
			return
		}
	}
}

// handleAllocUpdate processes a single updated allocation received from the
// server: it runs the update hooks, propagates the new version to the task
// runners, and kills the tasks if the allocation is being stopped.
func (ar *allocRunner) handleAllocUpdate(update *structs.Allocation) {
	// Detect Stop updates
	stopping := !ar.Alloc().TerminalStatus() && update.TerminalStatus()

	// Update ar.alloc
	ar.setAlloc(update)

	// Run update hooks if not stopping or dead
	if !update.TerminalStatus() {
		if err := ar.update(update); err != nil {
			ar.logger.Error("error running update hooks", "error", err)
		}
	}

	// Update task runners
	for _, tr := range ar.tasks {
		tr.Update(update)
	}

	// If alloc is being terminated, kill all tasks, leader first
	if stopping {
		ar.killTasks()
	}
}

func (ar *allocRunner) Listener() *cstructs.AllocListener {
	return ar.allocBroadcaster.Listen()
}

func (ar *allocRunner) destroyImpl() {
	// Stop any running tasks and persist states in case the client is
	// shutdown before Destroy finishes.
	states := ar.killTasks()
	calloc := ar.clientAlloc(states)
	ar.stateUpdater.AllocStateUpdated(calloc)

	// Wait for tasks to exit and postrun hooks to finish
	<-ar.waitCh

	// Run destroy hooks
	if err := ar.destroy(); err != nil {
		ar.logger.Warn("error running destroy hooks", "error", err)
	}

	// Wait for task state update handler to exit before removing local
	// state if Run() ran at all.
	<-ar.taskStateUpdateHandlerCh

	// Cleanup state db
	if err := ar.stateDB.DeleteAllocationBucket(ar.id); err != nil {
		ar.logger.Warn("failed to delete allocation state", "error", err)
	}

	// Mark alloc as destroyed
	ar.destroyedLock.Lock()

	if !ar.shutdown {
		ar.shutdown = true
		close(ar.shutdownCh)
	}

	ar.destroyed = true
	close(ar.destroyCh)

	ar.destroyedLock.Unlock()
}
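// Teardown sketch (editorial, not part of the original source): callers
// choose between Shutdown and Destroy depending on whether the agent itself
// is exiting or the allocation is being garbage collected. A hypothetical
// caller would do roughly:
//
//	// Agent is stopping: leave tasks running so they can be restored later.
//	ar.Shutdown()
//	<-ar.ShutdownCh()
//
//	// Allocation is terminal and garbage collected: kill tasks and delete
//	// local state.
//	ar.Destroy()
//	<-ar.DestroyCh()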
// Destroy the alloc runner by stopping it if it is still running and cleaning
// up all of its resources.
//
// This method is safe for calling concurrently with Run() and will cause it to
// exit (thus closing WaitCh).
// When the destroy action is completed, it will close DestroyCh().
func (ar *allocRunner) Destroy() {
	ar.destroyedLock.Lock()
	defer ar.destroyedLock.Unlock()

	if ar.destroyed {
		// Only destroy once
		return
	}

	if ar.destroyLaunched {
		// Only dispatch a destroy once
		return
	}

	ar.destroyLaunched = true

	// Synchronize calls to shutdown/destroy
	if ar.shutdownLaunched {
		go func() {
			ar.logger.Debug("Waiting for shutdown before destroying runner")
			<-ar.shutdownCh
			ar.destroyImpl()
		}()

		return
	}

	go ar.destroyImpl()
}

// IsDestroyed returns true if the alloc runner has been destroyed (stopped and
// garbage collected).
//
// This method is safe for calling concurrently with Run(). Callers must
// receive on WaitCh() to block until alloc runner has stopped and been
// destroyed.
func (ar *allocRunner) IsDestroyed() bool {
	ar.destroyedLock.Lock()
	defer ar.destroyedLock.Unlock()
	return ar.destroyed
}

// IsWaiting returns true if the alloc runner is waiting for its previous
// allocation to terminate.
//
// This method is safe for calling concurrently with Run().
func (ar *allocRunner) IsWaiting() bool {
	return ar.prevAllocWatcher.IsWaiting()
}

// DestroyCh is a channel that is closed when an allocrunner is closed due to
// an explicit call to Destroy().
func (ar *allocRunner) DestroyCh() <-chan struct{} {
	return ar.destroyCh
}

// ShutdownCh is a channel that is closed when an allocrunner is closed due to
// either an explicit call to Shutdown(), or Destroy().
func (ar *allocRunner) ShutdownCh() <-chan struct{} {
	return ar.shutdownCh
}

// Shutdown AllocRunner gracefully. Asynchronously shuts down all TaskRunners.
// Tasks are unaffected and may be restored.
// When the shutdown action is completed, it will close ShutdownCh().
func (ar *allocRunner) Shutdown() {
	ar.destroyedLock.Lock()
	defer ar.destroyedLock.Unlock()

	// Destroy is a superset of Shutdown so there's nothing to do if this
	// has already been destroyed.
	if ar.destroyed {
		return
	}

	// Destroy is a superset of Shutdown so if it's been marked for destruction,
	// don't try and shutdown in parallel. If shutdown has been launched, don't
	// try again.
	if ar.destroyLaunched || ar.shutdownLaunched {
		return
	}

	ar.shutdownLaunched = true

	go func() {
		ar.logger.Trace("shutting down")

		// Shutdown tasks gracefully if they were run
		wg := sync.WaitGroup{}
		for _, tr := range ar.tasks {
			wg.Add(1)
			go func(tr *taskrunner.TaskRunner) {
				tr.Shutdown()
				wg.Done()
			}(tr)
		}
		wg.Wait()

		// Wait for Run to exit
		<-ar.waitCh

		// Run shutdown hooks
		ar.shutdownHooks()

		// Wait for updater to finish its final run
		<-ar.taskStateUpdateHandlerCh

		ar.destroyedLock.Lock()
		ar.shutdown = true
		close(ar.shutdownCh)
		ar.destroyedLock.Unlock()
	}()
}

// IsMigrating returns true if the alloc runner is migrating data from its
// previous allocation.
//
// This method is safe for calling concurrently with Run().
func (ar *allocRunner) IsMigrating() bool {
	return ar.prevAllocMigrator.IsMigrating()
}

func (ar *allocRunner) StatsReporter() interfaces.AllocStatsReporter {
	return ar
}
// LatestAllocStats returns the latest stats for an allocation. If taskFilter
// is set, only stats for that task -- if it exists -- are returned.
func (ar *allocRunner) LatestAllocStats(taskFilter string) (*cstructs.AllocResourceUsage, error) {
	astat := &cstructs.AllocResourceUsage{
		Tasks: make(map[string]*cstructs.TaskResourceUsage, len(ar.tasks)),
		ResourceUsage: &cstructs.ResourceUsage{
			MemoryStats: &cstructs.MemoryStats{},
			CpuStats:    &cstructs.CpuStats{},
			DeviceStats: []*device.DeviceGroupStats{},
		},
	}

	for name, tr := range ar.tasks {
		if taskFilter != "" && taskFilter != name {
			// Getting stats for a particular task and it's not this one!
			continue
		}

		if usage := tr.LatestResourceUsage(); usage != nil {
			astat.Tasks[name] = usage
			astat.ResourceUsage.Add(usage.ResourceUsage)
			if usage.Timestamp > astat.Timestamp {
				astat.Timestamp = usage.Timestamp
			}
		}
	}

	return astat, nil
}

func (ar *allocRunner) GetTaskEventHandler(taskName string) drivermanager.EventHandler {
	if tr, ok := ar.tasks[taskName]; ok {
		return func(ev *drivers.TaskEvent) {
			tr.EmitEvent(&structs.TaskEvent{
				Type:          structs.TaskDriverMessage,
				Time:          ev.Timestamp.UnixNano(),
				Details:       ev.Annotations,
				DriverMessage: ev.Message,
			})
		}
	}
	return nil
}
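// Usage sketch (editorial, not part of the original source): the driver
// manager looks up a handler by task name and feeds it driver events, which
// the returned closure converts into task events. The task name "web" below
// is hypothetical; only fields referenced above (Timestamp, Message,
// Annotations) are shown.
//
//	if h := ar.GetTaskEventHandler("web"); h != nil {
//		h(&drivers.TaskEvent{
//			Timestamp:   time.Now(),
//			Message:     "image pulled",
//			Annotations: map[string]string{"image": "nginx"},
//		})
//	}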