github.com/bigcommerce/nomad@v0.9.3-bc/client/allocrunner/alloc_runner.go

package allocrunner

import (
	"context"
	"fmt"
	"path/filepath"
	"sync"
	"time"

	log "github.com/hashicorp/go-hclog"
	multierror "github.com/hashicorp/go-multierror"
	"github.com/hashicorp/nomad/client/allocdir"
	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
	"github.com/hashicorp/nomad/client/allocrunner/state"
	"github.com/hashicorp/nomad/client/allocrunner/taskrunner"
	"github.com/hashicorp/nomad/client/allocwatcher"
	"github.com/hashicorp/nomad/client/config"
	"github.com/hashicorp/nomad/client/consul"
	"github.com/hashicorp/nomad/client/devicemanager"
	cinterfaces "github.com/hashicorp/nomad/client/interfaces"
	"github.com/hashicorp/nomad/client/pluginmanager/drivermanager"
	cstate "github.com/hashicorp/nomad/client/state"
	cstructs "github.com/hashicorp/nomad/client/structs"
	"github.com/hashicorp/nomad/client/vaultclient"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/plugins/device"
	"github.com/hashicorp/nomad/plugins/drivers"
)

// allocRunner is used to run all the tasks in a given allocation
type allocRunner struct {
	// id is the ID of the allocation. Can be accessed without a lock
	id string

	// logger is the logger for the alloc runner.
	logger log.Logger

	clientConfig *config.Config

	// stateUpdater is used to emit updated alloc state
	stateUpdater cinterfaces.AllocStateHandler

	// taskStateUpdatedCh is ticked whenever task state has changed. Must
	// have len==1 to allow nonblocking notification of state updates while
	// the goroutine is already processing a previous update.
	taskStateUpdatedCh chan struct{}

	// taskStateUpdateHandlerCh is closed when the task state handling
	// goroutine exits. It is unsafe to destroy the local allocation state
	// before this goroutine exits.
	taskStateUpdateHandlerCh chan struct{}

	// allocUpdatedCh is a channel that is used to stream allocation updates into
	// the allocUpdate handler. Must have len==1 to allow nonblocking notification
	// of new allocation updates while the goroutine is processing a previous
	// update.
	allocUpdatedCh chan *structs.Allocation

	// consulClient is the client used by the consul service hook for
	// registering services and checks
	consulClient consul.ConsulServiceAPI

	// vaultClient is used to manage Vault tokens
	vaultClient vaultclient.VaultClient

	// waitCh is closed when the Run loop has exited
	waitCh chan struct{}

	// destroyed is true when the Run loop has exited, postrun hooks have
	// run, and alloc runner has been destroyed. Must acquire destroyedLock
	// to access.
	destroyed bool

	// destroyCh is closed when the Run loop has exited, postrun hooks have
	// run, and alloc runner has been destroyed.
	destroyCh chan struct{}

	// shutdown is true when the Run loop has exited, and shutdown hooks have
	// run. Must acquire destroyedLock to access.
	shutdown bool

	// shutdownCh is closed when the Run loop has exited, and shutdown hooks
	// have run.
	shutdownCh chan struct{}

	// destroyLaunched is true if Destroy has been called. Must acquire
	// destroyedLock to access.
	destroyLaunched bool

	// shutdownLaunched is true if Shutdown has been called. Must acquire
	// destroyedLock to access.
	shutdownLaunched bool

	// destroyedLock guards destroyed, destroyLaunched, shutdownLaunched,
	// and serializes Shutdown/Destroy calls.
	destroyedLock sync.Mutex

	// alloc captures the allocation being run.
	alloc     *structs.Allocation
	allocLock sync.RWMutex

	// state is the alloc runner's state
	state     *state.State
	stateLock sync.RWMutex

	stateDB cstate.StateDB

	// allocDir is used to build the allocations directory structure.
	allocDir *allocdir.AllocDir

	// runnerHooks are alloc runner lifecycle hooks that should be run on state
	// transitions.
	runnerHooks []interfaces.RunnerHook

	// tasks are the set of task runners
	tasks map[string]*taskrunner.TaskRunner

	// deviceStatsReporter is used to lookup resource usage for alloc devices
	deviceStatsReporter cinterfaces.DeviceStatsReporter

	// allocBroadcaster sends client allocation updates to all listeners
	allocBroadcaster *cstructs.AllocBroadcaster

	// prevAllocWatcher allows waiting for any previous or preempted allocations
	// to exit
	prevAllocWatcher allocwatcher.PrevAllocWatcher

	// prevAllocMigrator allows the migration of a previous allocation's alloc dir.
	prevAllocMigrator allocwatcher.PrevAllocMigrator

	// devicemanager is used to mount devices as well as lookup device
	// statistics
	devicemanager devicemanager.Manager

	// driverManager is responsible for dispensing driver plugins and registering
	// event handlers
	driverManager drivermanager.Manager

	// serversContactedCh is passed to TaskRunners so they can detect when
	// servers have been contacted for the first time in case of a failed
	// restore.
	serversContactedCh chan struct{}
}

// NewAllocRunner returns a new allocation runner.
func NewAllocRunner(config *Config) (*allocRunner, error) {
	alloc := config.Alloc
	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
	if tg == nil {
		return nil, fmt.Errorf("failed to lookup task group %q", alloc.TaskGroup)
	}

	ar := &allocRunner{
		id:                       alloc.ID,
		alloc:                    alloc,
		clientConfig:             config.ClientConfig,
		consulClient:             config.Consul,
		vaultClient:              config.Vault,
		tasks:                    make(map[string]*taskrunner.TaskRunner, len(tg.Tasks)),
		waitCh:                   make(chan struct{}),
		destroyCh:                make(chan struct{}),
		shutdownCh:               make(chan struct{}),
		state:                    &state.State{},
		stateDB:                  config.StateDB,
		stateUpdater:             config.StateUpdater,
		taskStateUpdatedCh:       make(chan struct{}, 1),
		taskStateUpdateHandlerCh: make(chan struct{}),
		allocUpdatedCh:           make(chan *structs.Allocation, 1),
		deviceStatsReporter:      config.DeviceStatsReporter,
		prevAllocWatcher:         config.PrevAllocWatcher,
		prevAllocMigrator:        config.PrevAllocMigrator,
		devicemanager:            config.DeviceManager,
		driverManager:            config.DriverManager,
		serversContactedCh:       config.ServersContactedCh,
	}

	// Create the logger based on the allocation ID
	ar.logger = config.Logger.Named("alloc_runner").With("alloc_id", alloc.ID)

	// Create alloc broadcaster
	ar.allocBroadcaster = cstructs.NewAllocBroadcaster(ar.logger)

	// Create alloc dir
	ar.allocDir = allocdir.NewAllocDir(ar.logger, filepath.Join(config.ClientConfig.AllocDir, alloc.ID))

	// Initialize the runner hooks.
	ar.initRunnerHooks()

	// Create the TaskRunners
	if err := ar.initTaskRunners(tg.Tasks); err != nil {
		return nil, err
	}

	return ar, nil
}
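
// The snippet below is an illustrative sketch (not code from this package) of
// how a caller such as the Nomad client might wire up and drive an
// allocRunner; the exact Config fields shown are assumptions based on how
// NewAllocRunner consumes them above.
//
//	ar, err := NewAllocRunner(&Config{
//		Alloc:        alloc,        // *structs.Allocation received from the server
//		Logger:       logger,
//		ClientConfig: clientConfig,
//		StateDB:      stateDB,
//		StateUpdater: updater,
//		// ... Consul, Vault, watcher, and device/driver manager fields as needed
//	})
//	if err != nil {
//		return err
//	}
//	if err := ar.Restore(); err != nil { // only when restoring persisted state
//		return err
//	}
//	go ar.Run()
//	// Later: ar.Update(newAlloc) on server updates, then ar.Destroy() or
//	// ar.Shutdown() to tear the runner down.
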
// initTaskRunners creates task runners but does *not* run them.
func (ar *allocRunner) initTaskRunners(tasks []*structs.Task) error {
	for _, task := range tasks {
		config := &taskrunner.Config{
			Alloc:               ar.alloc,
			ClientConfig:        ar.clientConfig,
			Task:                task,
			TaskDir:             ar.allocDir.NewTaskDir(task.Name),
			Logger:              ar.logger,
			StateDB:             ar.stateDB,
			StateUpdater:        ar,
			Consul:              ar.consulClient,
			Vault:               ar.vaultClient,
			DeviceStatsReporter: ar.deviceStatsReporter,
			DeviceManager:       ar.devicemanager,
			DriverManager:       ar.driverManager,
			ServersContactedCh:  ar.serversContactedCh,
		}

		// Create, but do not Run, the task runner
		tr, err := taskrunner.NewTaskRunner(config)
		if err != nil {
			return fmt.Errorf("failed creating runner for task %q: %v", task.Name, err)
		}

		ar.tasks[task.Name] = tr
	}
	return nil
}

func (ar *allocRunner) WaitCh() <-chan struct{} {
	return ar.waitCh
}

// Run the AllocRunner. Starts tasks if the alloc is non-terminal and closes
// WaitCh when it exits. Should be started in a goroutine.
func (ar *allocRunner) Run() {
	// Close the wait channel on return
	defer close(ar.waitCh)

	// Start the task state update handler
	go ar.handleTaskStateUpdates()

	// Start the alloc update handler
	go ar.handleAllocUpdates()

	// If the task update chan has been closed, that means we've been shutdown.
	select {
	case <-ar.taskStateUpdateHandlerCh:
		return
	default:
	}

	// When handling a (potentially restored) terminal alloc, ensure tasks and
	// post-run hooks are still run to perform any cleanup that may not have
	// been done before the earlier termination.

	// Run the prestart hooks if non-terminal
	if ar.shouldRun() {
		if err := ar.prerun(); err != nil {
			ar.logger.Error("prerun failed", "error", err)

			for _, tr := range ar.tasks {
				tr.MarkFailedDead(fmt.Sprintf("failed to setup runner: %v", err))
			}

			goto POST
		}
	}

	// Run the runners (blocks until they exit)
	ar.runTasks()

POST:
	// Run the postrun hooks
	if err := ar.postrun(); err != nil {
		ar.logger.Error("postrun failed", "error", err)
	}
}

// shouldRun returns true if the alloc is in a state in which the alloc runner
// should run it.
func (ar *allocRunner) shouldRun() bool {
	// Do not run allocs that are terminal
	if ar.Alloc().TerminalStatus() {
		ar.logger.Trace("alloc terminal; not running",
			"desired_status", ar.Alloc().DesiredStatus,
			"client_status", ar.Alloc().ClientStatus,
		)
		return false
	}

	// It's possible that the alloc local state was marked terminal before
	// the server copy of the alloc (checked above) was marked as terminal,
	// so check the local state as well.
	switch clientStatus := ar.AllocState().ClientStatus; clientStatus {
	case structs.AllocClientStatusComplete, structs.AllocClientStatusFailed, structs.AllocClientStatusLost:
		ar.logger.Trace("alloc terminal; updating server and not running", "status", clientStatus)
		return false
	}

	return true
}

// runTasks is used to run the task runners and block until they exit.
func (ar *allocRunner) runTasks() {
	for _, task := range ar.tasks {
		go task.Run()
	}

	for _, task := range ar.tasks {
		<-task.WaitCh()
	}
}

// Alloc returns the current allocation being run by this runner as sent by the
// server. This view of the allocation does not have updated task states.
func (ar *allocRunner) Alloc() *structs.Allocation {
	ar.allocLock.RLock()
	defer ar.allocLock.RUnlock()
	return ar.alloc
}

func (ar *allocRunner) setAlloc(updated *structs.Allocation) {
	ar.allocLock.Lock()
	ar.alloc = updated
	ar.allocLock.Unlock()
}

// GetAllocDir returns the alloc dir which is safe for concurrent use.
func (ar *allocRunner) GetAllocDir() *allocdir.AllocDir {
	return ar.allocDir
}

// Restore state from database. Must be called after NewAllocRunner but before
// Run.
func (ar *allocRunner) Restore() error {
	// Retrieve deployment status to avoid resetting it across agent
	// restarts. Once a deployment status is set Nomad no longer monitors
	// alloc health, so we must persist deployment state across restarts.
	ds, err := ar.stateDB.GetDeploymentStatus(ar.id)
	if err != nil {
		return err
	}

	ar.stateLock.Lock()
	ar.state.DeploymentStatus = ds
	ar.stateLock.Unlock()

	// Restore task runners
	for _, tr := range ar.tasks {
		if err := tr.Restore(); err != nil {
			return err
		}
	}

	return nil
}

// persistDeploymentStatus stores AllocDeploymentStatus.
func (ar *allocRunner) persistDeploymentStatus(ds *structs.AllocDeploymentStatus) {
	if err := ar.stateDB.PutDeploymentStatus(ar.id, ds); err != nil {
		// While any persistence errors are very bad, the worst case
		// scenario for failing to persist deployment status is that if
		// the agent is restarted it will monitor the deployment status
		// again. This could cause a deployment's status to change when
		// that shouldn't happen. However, allowing that seems better
		// than failing the entire allocation.
		ar.logger.Error("error storing deployment status", "error", err)
	}
}

// TaskStateUpdated is called by TaskRunner when a task's state has been
// updated. It does not process the update synchronously but instead notifies a
// goroutine that the state has changed. Since processing the state change may
// cause the task to be killed (thus changing its state again) it cannot be done
// synchronously as it would cause a deadlock due to reentrancy.
//
// The goroutine is used to compute changes to the alloc's ClientStatus and to
// update the server with the new state.
func (ar *allocRunner) TaskStateUpdated() {
	select {
	case ar.taskStateUpdatedCh <- struct{}{}:
	default:
		// already pending updates
	}
}
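
// The len==1 buffered channel used above implements a coalescing,
// non-blocking notification: any number of concurrent TaskStateUpdated calls
// collapse into at most one pending wakeup for handleTaskStateUpdates, which
// then re-reads the latest task states. A minimal standalone sketch of the
// same pattern (names are illustrative, not part of this package):
//
//	notify := make(chan struct{}, 1)
//
//	// Producer side: never blocks; redundant notifications are dropped.
//	select {
//	case notify <- struct{}{}:
//	default:
//	}
//
//	// Consumer side: each wakeup processes the current state, however many
//	// notifications were coalesced into it.
//	for range notify {
//		processLatestState() // hypothetical handler
//	}
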
// handleTaskStateUpdates must be run in a goroutine as it monitors
// taskStateUpdatedCh for task state update notifications and processes task
// states.
//
// Processing task state updates must be done in a goroutine as it may have to
// kill tasks which causes further task state updates.
func (ar *allocRunner) handleTaskStateUpdates() {
	defer close(ar.taskStateUpdateHandlerCh)

	for done := false; !done; {
		select {
		case <-ar.taskStateUpdatedCh:
		case <-ar.waitCh:
			// Run has exited, sync once more to ensure final
			// states are collected.
			done = true
		}

		ar.logger.Trace("handling task state update", "done", done)

		// Set with the appropriate event if task runners should be
		// killed.
		var killEvent *structs.TaskEvent

		// If task runners should be killed, this is set to the task
		// name whose fault it is.
		killTask := ""

		// True if task runners should be killed because a leader
		// failed (informational).
		leaderFailed := false

		// Task state has been updated; gather the state of the other tasks
		trNum := len(ar.tasks)
		liveRunners := make([]*taskrunner.TaskRunner, 0, trNum)
		states := make(map[string]*structs.TaskState, trNum)

		for name, tr := range ar.tasks {
			state := tr.TaskState()
			states[name] = state

			// Capture live task runners in case we need to kill them
			if state.State != structs.TaskStateDead {
				liveRunners = append(liveRunners, tr)
				continue
			}

			// Task is dead, determine if other tasks should be killed
			if state.Failed {
				// Only set failed event if no event has been
				// set yet to give dead leaders priority.
				if killEvent == nil {
					killTask = name
					killEvent = structs.NewTaskEvent(structs.TaskSiblingFailed).
						SetFailedSibling(name)
				}
			} else if tr.IsLeader() {
				killEvent = structs.NewTaskEvent(structs.TaskLeaderDead)
				leaderFailed = true
				killTask = name
			}
		}

		// If there's a kill event set and live runners, kill them
		if killEvent != nil && len(liveRunners) > 0 {

			// Log kill reason
			if leaderFailed {
				ar.logger.Debug("leader task dead, destroying all tasks", "leader_task", killTask)
			} else {
				ar.logger.Debug("task failure, destroying all tasks", "failed_task", killTask)
			}

			// Emit kill event for live runners
			for _, tr := range liveRunners {
				tr.EmitEvent(killEvent)
			}

			// Kill 'em all
			states = ar.killTasks()

			// Wait for TaskRunners to exit before continuing to
			// prevent looping before TaskRunners have transitioned
			// to Dead.
			for _, tr := range liveRunners {
				select {
				case <-tr.WaitCh():
				case <-ar.waitCh:
				}
			}
		}

		// Get the client allocation
		calloc := ar.clientAlloc(states)

		// Update the server
		ar.stateUpdater.AllocStateUpdated(calloc)

		// Broadcast client alloc to listeners
		ar.allocBroadcaster.Send(calloc)
	}
}

// killTasks kills all task runners, leader (if there is one) first. Errors are
// logged except taskrunner.ErrTaskNotRunning which is ignored. Task states
// after Kill has been called are returned.
func (ar *allocRunner) killTasks() map[string]*structs.TaskState {
	var mu sync.Mutex
	states := make(map[string]*structs.TaskState, len(ar.tasks))

	// Kill leader first, synchronously
	for name, tr := range ar.tasks {
		if !tr.IsLeader() {
			continue
		}

		err := tr.Kill(context.TODO(), structs.NewTaskEvent(structs.TaskKilling))
		if err != nil && err != taskrunner.ErrTaskNotRunning {
			ar.logger.Warn("error stopping leader task", "error", err, "task_name", name)
		}

		state := tr.TaskState()
		states[name] = state
		break
	}

	// Kill the rest concurrently
	wg := sync.WaitGroup{}
	for name, tr := range ar.tasks {
		if tr.IsLeader() {
			continue
		}

		wg.Add(1)
		go func(name string, tr *taskrunner.TaskRunner) {
			defer wg.Done()
			err := tr.Kill(context.TODO(), structs.NewTaskEvent(structs.TaskKilling))
			if err != nil && err != taskrunner.ErrTaskNotRunning {
				ar.logger.Warn("error stopping task", "error", err, "task_name", name)
			}

			state := tr.TaskState()
			mu.Lock()
			states[name] = state
			mu.Unlock()
		}(name, tr)
	}
	wg.Wait()

	return states
}

// clientAlloc takes in the task states and returns an Allocation populated
// with Client specific fields
func (ar *allocRunner) clientAlloc(taskStates map[string]*structs.TaskState) *structs.Allocation {
	ar.stateLock.Lock()
	defer ar.stateLock.Unlock()

	// store task states for AllocState to expose
	ar.state.TaskStates = taskStates

	a := &structs.Allocation{
		ID:         ar.id,
		TaskStates: taskStates,
	}

	if d := ar.state.DeploymentStatus; d != nil {
		a.DeploymentStatus = d.Copy()
	}

	// Compute the ClientStatus
	if ar.state.ClientStatus != "" {
		// The client status is being forced
		a.ClientStatus, a.ClientDescription = ar.state.ClientStatus, ar.state.ClientDescription
	} else {
		a.ClientStatus, a.ClientDescription = getClientStatus(taskStates)
	}

	// If the allocation is terminal, make sure all required fields are properly
	// set.
	if a.ClientTerminalStatus() {
		alloc := ar.Alloc()

		// If we are part of a deployment and the alloc has failed, mark the
		// alloc as unhealthy. This guards against the watcher not being started.
		// If the health status is already set then terminal allocations should
		// not overwrite it.
		if a.ClientStatus == structs.AllocClientStatusFailed &&
			alloc.DeploymentID != "" && !a.DeploymentStatus.HasHealth() {
			a.DeploymentStatus = &structs.AllocDeploymentStatus{
				Healthy: helper.BoolToPtr(false),
			}
		}

		// Make sure we have marked FinishedAt for every task. This is used
		// to calculate the reschedule time for failed allocations.
		now := time.Now()
		for taskName := range ar.tasks {
			ts, ok := a.TaskStates[taskName]
			if !ok {
				ts = &structs.TaskState{}
				a.TaskStates[taskName] = ts
			}
			if ts.FinishedAt.IsZero() {
				ts.FinishedAt = now
			}
		}
	}

	return a
}

// getClientStatus takes in the task states for a given allocation and computes
// the client status and description
func getClientStatus(taskStates map[string]*structs.TaskState) (status, description string) {
	var pending, running, dead, failed bool
	for _, state := range taskStates {
		switch state.State {
		case structs.TaskStateRunning:
			running = true
		case structs.TaskStatePending:
			pending = true
		case structs.TaskStateDead:
			if state.Failed {
				failed = true
			} else {
				dead = true
			}
		}
	}

	// Determine the alloc status
	if failed {
		return structs.AllocClientStatusFailed, "Failed tasks"
	} else if running {
		return structs.AllocClientStatusRunning, "Tasks are running"
	} else if pending {
		return structs.AllocClientStatusPending, "No tasks have started"
	} else if dead {
		return structs.AllocClientStatusComplete, "All tasks have completed"
	}

	return "", ""
}

// SetClientStatus is a helper for forcing a specific client
// status on the alloc runner. This is used during restore errors
// when the task state can't be restored.
func (ar *allocRunner) SetClientStatus(clientStatus string) {
	ar.stateLock.Lock()
	defer ar.stateLock.Unlock()
	ar.state.ClientStatus = clientStatus
}

// AllocState returns a copy of allocation state including a snapshot of task
// states.
func (ar *allocRunner) AllocState() *state.State {
	ar.stateLock.RLock()
	state := ar.state.Copy()
	ar.stateLock.RUnlock()

	// If TaskStateUpdated has not been called yet, ar.state.TaskStates
	// won't be set as it is not the canonical source of TaskStates.
	if len(state.TaskStates) == 0 {
		state.TaskStates = make(map[string]*structs.TaskState, len(ar.tasks))
		for k, tr := range ar.tasks {
			state.TaskStates[k] = tr.TaskState()
		}
	}

	// Generate alloc to get other state fields
	alloc := ar.clientAlloc(state.TaskStates)
	state.ClientStatus = alloc.ClientStatus
	state.ClientDescription = alloc.ClientDescription
	state.DeploymentStatus = alloc.DeploymentStatus

	return state
}

// Update asynchronously updates the running allocation with a new version
// received from the server.
// When processing a new update, we will first attempt to drain stale updates
// from the queue, before appending the new one.
func (ar *allocRunner) Update(update *structs.Allocation) {
	select {
	// Drain the queued update from the channel if possible, and check the
	// modify index
	case oldUpdate := <-ar.allocUpdatedCh:
		// If the old update is newer than the replacement, then skip the new one
		// and return. This case shouldn't happen, but may in the case of a bug
		// elsewhere inside the system.
		if oldUpdate.AllocModifyIndex > update.AllocModifyIndex {
			ar.logger.Debug("Discarding allocation update due to newer alloc revision in queue",
				"old_modify_index", oldUpdate.AllocModifyIndex,
				"new_modify_index", update.AllocModifyIndex)
			ar.allocUpdatedCh <- oldUpdate
			return
		} else {
			ar.logger.Debug("Discarding allocation update",
				"skipped_modify_index", oldUpdate.AllocModifyIndex,
				"new_modify_index", update.AllocModifyIndex)
		}
	case <-ar.waitCh:
		ar.logger.Trace("AllocRunner has terminated, skipping alloc update",
			"modify_index", update.AllocModifyIndex)
		return
	default:
	}

	// Queue the new update
	ar.allocUpdatedCh <- update
}

func (ar *allocRunner) handleAllocUpdates() {
	for {
		select {
		case update := <-ar.allocUpdatedCh:
			ar.handleAllocUpdate(update)
		case <-ar.waitCh:
			return
		}
	}
}

// handleAllocUpdate processes a single allocation update serially on behalf of
// handleAllocUpdates: it records the new alloc, runs update hooks for
// non-terminal allocs, propagates the update to the task runners, and kills
// tasks if the alloc is being stopped.
func (ar *allocRunner) handleAllocUpdate(update *structs.Allocation) {
	// Detect Stop updates
	stopping := !ar.Alloc().TerminalStatus() && update.TerminalStatus()

	// Update ar.alloc
	ar.setAlloc(update)

	// Run update hooks if not stopping or dead
	if !update.TerminalStatus() {
		if err := ar.update(update); err != nil {
			ar.logger.Error("error running update hooks", "error", err)
		}
	}

	// Update task runners
	for _, tr := range ar.tasks {
		tr.Update(update)
	}

	// If the alloc is being terminated, kill all tasks, leader first
	if stopping {
		ar.killTasks()
	}
}

func (ar *allocRunner) Listener() *cstructs.AllocListener {
	return ar.allocBroadcaster.Listen()
}

func (ar *allocRunner) destroyImpl() {
	// Stop any running tasks and persist states in case the client is
	// shutdown before Destroy finishes.
	states := ar.killTasks()
	calloc := ar.clientAlloc(states)
	ar.stateUpdater.AllocStateUpdated(calloc)

	// Wait for tasks to exit and postrun hooks to finish
	<-ar.waitCh

	// Run destroy hooks
	if err := ar.destroy(); err != nil {
		ar.logger.Warn("error running destroy hooks", "error", err)
	}

	// Wait for task state update handler to exit before removing local
	// state if Run() ran at all.
	<-ar.taskStateUpdateHandlerCh

	// Cleanup state db
	if err := ar.stateDB.DeleteAllocationBucket(ar.id); err != nil {
		ar.logger.Warn("failed to delete allocation state", "error", err)
	}

	// Mark alloc as destroyed
	ar.destroyedLock.Lock()

	if !ar.shutdown {
		ar.shutdown = true
		close(ar.shutdownCh)
	}

	ar.destroyed = true
	close(ar.destroyCh)

	ar.destroyedLock.Unlock()
}

// Destroy the alloc runner by stopping it if it is still running and cleaning
// up all of its resources.
//
// This method is safe for calling concurrently with Run() and will cause it to
// exit (thus closing WaitCh).
// When the destroy action is completed, it will close DestroyCh().
func (ar *allocRunner) Destroy() {
	ar.destroyedLock.Lock()
	defer ar.destroyedLock.Unlock()

	if ar.destroyed {
		// Only destroy once
		return
	}

	if ar.destroyLaunched {
		// Only dispatch a destroy once
		return
	}

	ar.destroyLaunched = true

	// Synchronize calls to shutdown/destroy
	if ar.shutdownLaunched {
		go func() {
			ar.logger.Debug("Waiting for shutdown before destroying runner")
			<-ar.shutdownCh
			ar.destroyImpl()
		}()

		return
	}

	go ar.destroyImpl()
}

// IsDestroyed returns true if the alloc runner has been destroyed (stopped and
// garbage collected).
//
// This method is safe for calling concurrently with Run(). Callers must
// receive on WaitCh() to block until alloc runner has stopped and been
// destroyed.
func (ar *allocRunner) IsDestroyed() bool {
	ar.destroyedLock.Lock()
	defer ar.destroyedLock.Unlock()
	return ar.destroyed
}

// IsWaiting returns true if the alloc runner is waiting for its previous
// allocation to terminate.
//
// This method is safe for calling concurrently with Run().
func (ar *allocRunner) IsWaiting() bool {
	return ar.prevAllocWatcher.IsWaiting()
}

// DestroyCh is a channel that is closed when an allocrunner is closed due to
// an explicit call to Destroy().
func (ar *allocRunner) DestroyCh() <-chan struct{} {
	return ar.destroyCh
}

// ShutdownCh is a channel that is closed when an allocrunner is closed due to
// either an explicit call to Shutdown(), or Destroy().
func (ar *allocRunner) ShutdownCh() <-chan struct{} {
	return ar.shutdownCh
}

// Shutdown stops the AllocRunner gracefully. It asynchronously shuts down all
// TaskRunners. Tasks are unaffected and may be restored.
// When the shutdown action is completed, it will close ShutdownCh().
func (ar *allocRunner) Shutdown() {
	ar.destroyedLock.Lock()
	defer ar.destroyedLock.Unlock()

	// Destroy is a superset of Shutdown so there's nothing to do if this
	// has already been destroyed.
	if ar.destroyed {
		return
	}

	// Destroy is a superset of Shutdown so if it's been marked for destruction,
	// don't try and shutdown in parallel. If shutdown has been launched, don't
	// try again.
	if ar.destroyLaunched || ar.shutdownLaunched {
		return
	}

	ar.shutdownLaunched = true

	go func() {
		ar.logger.Trace("shutting down")

		// Shutdown tasks gracefully if they were run
		wg := sync.WaitGroup{}
		for _, tr := range ar.tasks {
			wg.Add(1)
			go func(tr *taskrunner.TaskRunner) {
				tr.Shutdown()
				wg.Done()
			}(tr)
		}
		wg.Wait()

		// Wait for Run to exit
		<-ar.waitCh

		// Run shutdown hooks
		ar.shutdownHooks()

		// Wait for updater to finish its final run
		<-ar.taskStateUpdateHandlerCh

		ar.destroyedLock.Lock()
		ar.shutdown = true
		close(ar.shutdownCh)
		ar.destroyedLock.Unlock()
	}()
}

// IsMigrating returns true if the alloc runner is migrating data from its
// previous allocation.
//
// This method is safe for calling concurrently with Run().
func (ar *allocRunner) IsMigrating() bool {
	return ar.prevAllocMigrator.IsMigrating()
}

func (ar *allocRunner) StatsReporter() interfaces.AllocStatsReporter {
	return ar
}

// LatestAllocStats returns the latest stats for an allocation. If taskFilter
// is set, only stats for that task -- if it exists -- are returned.
func (ar *allocRunner) LatestAllocStats(taskFilter string) (*cstructs.AllocResourceUsage, error) {
	astat := &cstructs.AllocResourceUsage{
		Tasks: make(map[string]*cstructs.TaskResourceUsage, len(ar.tasks)),
		ResourceUsage: &cstructs.ResourceUsage{
			MemoryStats: &cstructs.MemoryStats{},
			CpuStats:    &cstructs.CpuStats{},
			DeviceStats: []*device.DeviceGroupStats{},
		},
	}

	for name, tr := range ar.tasks {
		if taskFilter != "" && taskFilter != name {
			// Getting stats for a particular task and it's not this one!
			continue
		}

		if usage := tr.LatestResourceUsage(); usage != nil {
			astat.Tasks[name] = usage
			astat.ResourceUsage.Add(usage.ResourceUsage)
			if usage.Timestamp > astat.Timestamp {
				astat.Timestamp = usage.Timestamp
			}
		}
	}

	return astat, nil
}

func (ar *allocRunner) GetTaskEventHandler(taskName string) drivermanager.EventHandler {
	if tr, ok := ar.tasks[taskName]; ok {
		return func(ev *drivers.TaskEvent) {
			tr.EmitEvent(&structs.TaskEvent{
				Type:          structs.TaskDriverMessage,
				Time:          ev.Timestamp.UnixNano(),
				Details:       ev.Annotations,
				DriverMessage: ev.Message,
			})
		}
	}
	return nil
}

// RestartTask signals the task runner for the provided task to restart.
func (ar *allocRunner) RestartTask(taskName string, taskEvent *structs.TaskEvent) error {
	tr, ok := ar.tasks[taskName]
	if !ok {
		return fmt.Errorf("Could not find task runner for task: %s", taskName)
	}

	return tr.Restart(context.TODO(), taskEvent, false)
}

// RestartAll signals all task runners in the allocation to restart and passes
// a copy of the task event to each restart event.
// Returns any errors in a concatenated form.
func (ar *allocRunner) RestartAll(taskEvent *structs.TaskEvent) error {
	var err *multierror.Error

	for tn := range ar.tasks {
		rerr := ar.RestartTask(tn, taskEvent.Copy())
		if rerr != nil {
			err = multierror.Append(err, rerr)
		}
	}

	return err.ErrorOrNil()
}

// Signal sends a signal request to task runners inside an allocation. If the
// taskName is empty, then it is sent to all tasks.
func (ar *allocRunner) Signal(taskName, signal string) error {
	event := structs.NewTaskEvent(structs.TaskSignaling).SetSignalText(signal)

	if taskName != "" {
		tr, ok := ar.tasks[taskName]
		if !ok {
			return fmt.Errorf("Task not found")
		}

		return tr.Signal(event, signal)
	}

	var err *multierror.Error

	for tn, tr := range ar.tasks {
		rerr := tr.Signal(event.Copy(), signal)
		if rerr != nil {
			err = multierror.Append(err, fmt.Errorf("Failed to signal task: %s, err: %v", tn, rerr))
		}
	}

	return err.ErrorOrNil()
}

func (ar *allocRunner) GetTaskExecHandler(taskName string) drivermanager.TaskExecHandler {
	tr, ok := ar.tasks[taskName]
	if !ok {
		return nil
	}

	return tr.TaskExecHandler()
}

func (ar *allocRunner) GetTaskDriverCapabilities(taskName string) (*drivers.Capabilities, error) {
	tr, ok := ar.tasks[taskName]
	if !ok {
		return nil, fmt.Errorf("task not found")
	}

	return tr.DriverCapabilities()
}
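
// Shutdown vs. Destroy from a caller's point of view (an illustrative sketch;
// the real call sites live in the Nomad client, not in this file):
//
//	// Agent shutdown: leave tasks running so they can be restored when the
//	// agent restarts.
//	ar.Shutdown()
//	<-ar.ShutdownCh()
//
//	// Garbage collection: kill tasks, run destroy hooks, and delete all
//	// local state for the allocation.
//	ar.Destroy()
//	<-ar.DestroyCh()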