github.com/ferranbt/nomad@v0.9.3-0.20190607002617-85c449b7667c/client/allocrunner/alloc_runner.go

package allocrunner

import (
	"context"
	"fmt"
	"path/filepath"
	"sync"
	"time"

	log "github.com/hashicorp/go-hclog"
	multierror "github.com/hashicorp/go-multierror"
	"github.com/hashicorp/nomad/client/allocdir"
	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
	"github.com/hashicorp/nomad/client/allocrunner/state"
	"github.com/hashicorp/nomad/client/allocrunner/taskrunner"
	"github.com/hashicorp/nomad/client/allocwatcher"
	"github.com/hashicorp/nomad/client/config"
	"github.com/hashicorp/nomad/client/consul"
	"github.com/hashicorp/nomad/client/devicemanager"
	cinterfaces "github.com/hashicorp/nomad/client/interfaces"
	"github.com/hashicorp/nomad/client/pluginmanager/drivermanager"
	cstate "github.com/hashicorp/nomad/client/state"
	cstructs "github.com/hashicorp/nomad/client/structs"
	"github.com/hashicorp/nomad/client/vaultclient"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/plugins/device"
	"github.com/hashicorp/nomad/plugins/drivers"
)

// allocRunner is used to run all the tasks in a given allocation
type allocRunner struct {
	// id is the ID of the allocation. Can be accessed without a lock
	id string

	// Logger is the logger for the alloc runner.
	logger log.Logger

	clientConfig *config.Config

	// stateUpdater is used to emit updated alloc state
	stateUpdater cinterfaces.AllocStateHandler

	// taskStateUpdatedCh is ticked whenever task state has changed. Must
	// have len==1 to allow nonblocking notification of state updates while
	// the goroutine is already processing a previous update.
	taskStateUpdatedCh chan struct{}

	// taskStateUpdateHandlerCh is closed when the task state handling
	// goroutine exits. It is unsafe to destroy the local allocation state
	// before this goroutine exits.
	taskStateUpdateHandlerCh chan struct{}

	// allocUpdatedCh is a channel that is used to stream allocation updates into
	// the allocUpdate handler. Must have len==1 to allow nonblocking notification
	// of new allocation updates while the goroutine is processing a previous
	// update.
	allocUpdatedCh chan *structs.Allocation

	// consulClient is the client used by the consul service hook for
	// registering services and checks
	consulClient consul.ConsulServiceAPI

	// vaultClient is used to manage Vault tokens
	vaultClient vaultclient.VaultClient

	// waitCh is closed when the Run loop has exited
	waitCh chan struct{}

	// destroyed is true when the Run loop has exited, postrun hooks have
	// run, and alloc runner has been destroyed. Must acquire destroyedLock
	// to access.
	destroyed bool

	// destroyCh is closed when the Run loop has exited, postrun hooks have
	// run, and alloc runner has been destroyed.
	destroyCh chan struct{}

	// shutdown is true when the Run loop has exited, and shutdown hooks have
	// run. Must acquire destroyedLock to access.
	shutdown bool

	// shutdownCh is closed when the Run loop has exited, and shutdown hooks
	// have run.
	shutdownCh chan struct{}

	// destroyLaunched is true if Destroy has been called. Must acquire
	// destroyedLock to access.
	destroyLaunched bool

	// shutdownLaunched is true if Shutdown has been called. Must acquire
	// destroyedLock to access.
	shutdownLaunched bool

	// destroyedLock guards destroyed, destroyLaunched, shutdownLaunched,
	// and serializes Shutdown/Destroy calls.
	destroyedLock sync.Mutex

	// Alloc captures the allocation being run.
	alloc     *structs.Allocation
	allocLock sync.RWMutex

	// state is the alloc runner's state
	state     *state.State
	stateLock sync.RWMutex

	stateDB cstate.StateDB

	// allocDir is used to build the allocations directory structure.
	allocDir *allocdir.AllocDir

	// runnerHooks are alloc runner lifecycle hooks that should be run on state
	// transitions.
	runnerHooks []interfaces.RunnerHook

	// tasks are the set of task runners
	tasks map[string]*taskrunner.TaskRunner

	// deviceStatsReporter is used to lookup resource usage for alloc devices
	deviceStatsReporter cinterfaces.DeviceStatsReporter

	// allocBroadcaster sends client allocation updates to all listeners
	allocBroadcaster *cstructs.AllocBroadcaster

	// prevAllocWatcher allows waiting for any previous or preempted allocations
	// to exit
	prevAllocWatcher allocwatcher.PrevAllocWatcher

	// prevAllocMigrator allows the migration of a previous allocation's alloc dir.
	prevAllocMigrator allocwatcher.PrevAllocMigrator

	// devicemanager is used to mount devices as well as lookup device
	// statistics
	devicemanager devicemanager.Manager

	// driverManager is responsible for dispensing driver plugins and registering
	// event handlers
	driverManager drivermanager.Manager

	// serversContactedCh is passed to TaskRunners so they can detect when
	// servers have been contacted for the first time in case of a failed
	// restore.
	serversContactedCh chan struct{}
}
// NewAllocRunner returns a new allocation runner.
func NewAllocRunner(config *Config) (*allocRunner, error) {
	alloc := config.Alloc
	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
	if tg == nil {
		return nil, fmt.Errorf("failed to lookup task group %q", alloc.TaskGroup)
	}

	ar := &allocRunner{
		id:                       alloc.ID,
		alloc:                    alloc,
		clientConfig:             config.ClientConfig,
		consulClient:             config.Consul,
		vaultClient:              config.Vault,
		tasks:                    make(map[string]*taskrunner.TaskRunner, len(tg.Tasks)),
		waitCh:                   make(chan struct{}),
		destroyCh:                make(chan struct{}),
		shutdownCh:               make(chan struct{}),
		state:                    &state.State{},
		stateDB:                  config.StateDB,
		stateUpdater:             config.StateUpdater,
		taskStateUpdatedCh:       make(chan struct{}, 1),
		taskStateUpdateHandlerCh: make(chan struct{}),
		allocUpdatedCh:           make(chan *structs.Allocation, 1),
		deviceStatsReporter:      config.DeviceStatsReporter,
		prevAllocWatcher:         config.PrevAllocWatcher,
		prevAllocMigrator:        config.PrevAllocMigrator,
		devicemanager:            config.DeviceManager,
		driverManager:            config.DriverManager,
		serversContactedCh:       config.ServersContactedCh,
	}

	// Create the logger based on the allocation ID
	ar.logger = config.Logger.Named("alloc_runner").With("alloc_id", alloc.ID)

	// Create alloc broadcaster
	ar.allocBroadcaster = cstructs.NewAllocBroadcaster(ar.logger)

	// Create alloc dir
	ar.allocDir = allocdir.NewAllocDir(ar.logger, filepath.Join(config.ClientConfig.AllocDir, alloc.ID))

	// Initialize the runner hooks.
	ar.initRunnerHooks()

	// Create the TaskRunners
	if err := ar.initTaskRunners(tg.Tasks); err != nil {
		return nil, err
	}

	return ar, nil
}

// initTaskRunners creates task runners but does *not* run them.
func (ar *allocRunner) initTaskRunners(tasks []*structs.Task) error {
	for _, task := range tasks {
		config := &taskrunner.Config{
			Alloc:               ar.alloc,
			ClientConfig:        ar.clientConfig,
			Task:                task,
			TaskDir:             ar.allocDir.NewTaskDir(task.Name),
			Logger:              ar.logger,
			StateDB:             ar.stateDB,
			StateUpdater:        ar,
			Consul:              ar.consulClient,
			Vault:               ar.vaultClient,
			DeviceStatsReporter: ar.deviceStatsReporter,
			DeviceManager:       ar.devicemanager,
			DriverManager:       ar.driverManager,
			ServersContactedCh:  ar.serversContactedCh,
		}

		// Create, but do not Run, the task runner
		tr, err := taskrunner.NewTaskRunner(config)
		if err != nil {
			return fmt.Errorf("failed creating runner for task %q: %v", task.Name, err)
		}

		ar.tasks[task.Name] = tr
	}
	return nil
}
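// A minimal sketch of how client code might drive an allocRunner end to end.
// The *Config wiring and error handling are assumed to happen elsewhere in the
// client; only methods defined in this file are used:
//
//	ar, err := NewAllocRunner(config)
//	if err != nil {
//		return err
//	}
//	if err := ar.Restore(); err != nil { // only when restoring persisted client state
//		return err
//	}
//	go ar.Run()   // starts tasks if the alloc is non-terminal
//	<-ar.WaitCh() // blocks until Run exits
//
//	ar.Destroy()     // stop and clean up local state
//	<-ar.DestroyCh() // closed once destruction is complete
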
func (ar *allocRunner) WaitCh() <-chan struct{} {
	return ar.waitCh
}

// Run the AllocRunner. Starts tasks if the alloc is non-terminal and closes
// WaitCh when it exits. Should be started in a goroutine.
func (ar *allocRunner) Run() {
	// Close the wait channel on return
	defer close(ar.waitCh)

	// Start the task state update handler
	go ar.handleTaskStateUpdates()

	// Start the alloc update handler
	go ar.handleAllocUpdates()

	// If task update chan has been closed, that means we've been shutdown.
	select {
	case <-ar.taskStateUpdateHandlerCh:
		return
	default:
	}

	// Run the prestart hooks if non-terminal
	if ar.shouldRun() {
		if err := ar.prerun(); err != nil {
			ar.logger.Error("prerun failed", "error", err)
			goto POST
		}
	}

	// Run the runners (blocks until they exit)
	ar.runTasks()

POST:
	// Run the postrun hooks
	if err := ar.postrun(); err != nil {
		ar.logger.Error("postrun failed", "error", err)
	}
}

// shouldRun returns true if the alloc runner should run the alloc, i.e. the
// alloc is in a non-terminal state.
func (ar *allocRunner) shouldRun() bool {
	// Do not run allocs that are terminal
	if ar.Alloc().TerminalStatus() {
		ar.logger.Trace("alloc terminal; not running",
			"desired_status", ar.Alloc().DesiredStatus,
			"client_status", ar.Alloc().ClientStatus,
		)
		return false
	}

	// It's possible that the alloc local state was marked terminal before
	// the server copy of the alloc (checked above) was marked as terminal,
	// so check the local state as well.
	switch clientStatus := ar.AllocState().ClientStatus; clientStatus {
	case structs.AllocClientStatusComplete, structs.AllocClientStatusFailed, structs.AllocClientStatusLost:
		ar.logger.Trace("alloc terminal; updating server and not running", "status", clientStatus)
		return false
	}

	return true
}

// runTasks is used to run the task runners and block until they exit.
func (ar *allocRunner) runTasks() {
	for _, task := range ar.tasks {
		go task.Run()
	}

	for _, task := range ar.tasks {
		<-task.WaitCh()
	}
}
// Alloc returns the current allocation being run by this runner as sent by the
// server. This view of the allocation does not have updated task states.
func (ar *allocRunner) Alloc() *structs.Allocation {
	ar.allocLock.RLock()
	defer ar.allocLock.RUnlock()
	return ar.alloc
}

func (ar *allocRunner) setAlloc(updated *structs.Allocation) {
	ar.allocLock.Lock()
	ar.alloc = updated
	ar.allocLock.Unlock()
}

// GetAllocDir returns the alloc dir which is safe for concurrent use.
func (ar *allocRunner) GetAllocDir() *allocdir.AllocDir {
	return ar.allocDir
}

// Restore state from database. Must be called after NewAllocRunner but before
// Run.
func (ar *allocRunner) Restore() error {
	// Retrieve deployment status to avoid resetting it across agent
	// restarts. Once a deployment status is set Nomad no longer monitors
	// alloc health, so we must persist deployment state across restarts.
	ds, err := ar.stateDB.GetDeploymentStatus(ar.id)
	if err != nil {
		return err
	}

	ar.stateLock.Lock()
	ar.state.DeploymentStatus = ds
	ar.stateLock.Unlock()

	// Restore task runners
	for _, tr := range ar.tasks {
		if err := tr.Restore(); err != nil {
			return err
		}
	}

	return nil
}

// persistDeploymentStatus stores AllocDeploymentStatus.
func (ar *allocRunner) persistDeploymentStatus(ds *structs.AllocDeploymentStatus) {
	if err := ar.stateDB.PutDeploymentStatus(ar.id, ds); err != nil {
		// While any persistence errors are very bad, the worst case
		// scenario for failing to persist deployment status is that if
		// the agent is restarted it will monitor the deployment status
		// again. This could cause a deployment's status to change when
		// that shouldn't happen. However, allowing that seems better
		// than failing the entire allocation.
		ar.logger.Error("error storing deployment status", "error", err)
	}
}

// TaskStateUpdated is called by TaskRunner when a task's state has been
// updated. It does not process the update synchronously but instead notifies a
// goroutine that the state has changed. Since processing the state change may
// cause the task to be killed (thus changing its state again) it cannot be done
// synchronously as it would cause a deadlock due to reentrancy.
//
// The goroutine is used to compute changes to the alloc's ClientStatus and to
// update the server with the new state.
func (ar *allocRunner) TaskStateUpdated() {
	select {
	case ar.taskStateUpdatedCh <- struct{}{}:
	default:
		// already pending updates
	}
}
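// TaskStateUpdated relies on taskStateUpdatedCh having a buffer of one: a send
// either enqueues a pending wakeup or hits the default case because a wakeup
// is already queued, so many rapid updates coalesce into a single pass of the
// handler. A standalone sketch of the same pattern (names are illustrative,
// not part of this package):
//
//	updatedCh := make(chan struct{}, 1)
//
//	// Producer: never blocks; repeated calls coalesce into one pending wakeup.
//	notify := func() {
//		select {
//		case updatedCh <- struct{}{}:
//		default: // a wakeup is already pending
//		}
//	}
//
//	// Consumer: each wakeup processes everything that changed since the last one.
//	go func() {
//		for range updatedCh {
//			// gather task states and report them
//		}
//	}()
//
//	notify()
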
// handleTaskStateUpdates must be run in a goroutine as it monitors
// taskStateUpdatedCh for task state update notifications and processes task
// states.
//
// Processing task state updates must be done in a goroutine as it may have to
// kill tasks which causes further task state updates.
func (ar *allocRunner) handleTaskStateUpdates() {
	defer close(ar.taskStateUpdateHandlerCh)

	for done := false; !done; {
		select {
		case <-ar.taskStateUpdatedCh:
		case <-ar.waitCh:
			// Run has exited, sync once more to ensure final
			// states are collected.
			done = true
		}

		ar.logger.Trace("handling task state update", "done", done)

		// Set with the appropriate event if task runners should be
		// killed.
		var killEvent *structs.TaskEvent

		// If task runners should be killed, this is set to the task
		// name whose fault it is.
		killTask := ""

		// True if task runners should be killed because a leader
		// failed (informational).
		leaderFailed := false

		// Task state has been updated; gather the state of the other tasks
		trNum := len(ar.tasks)
		liveRunners := make([]*taskrunner.TaskRunner, 0, trNum)
		states := make(map[string]*structs.TaskState, trNum)

		for name, tr := range ar.tasks {
			state := tr.TaskState()
			states[name] = state

			// Capture live task runners in case we need to kill them
			if state.State != structs.TaskStateDead {
				liveRunners = append(liveRunners, tr)
				continue
			}

			// Task is dead, determine if other tasks should be killed
			if state.Failed {
				// Only set failed event if no event has been
				// set yet to give dead leaders priority.
				if killEvent == nil {
					killTask = name
					killEvent = structs.NewTaskEvent(structs.TaskSiblingFailed).
						SetFailedSibling(name)
				}
			} else if tr.IsLeader() {
				killEvent = structs.NewTaskEvent(structs.TaskLeaderDead)
				leaderFailed = true
				killTask = name
			}
		}

		// If there's a kill event set and live runners, kill them
		if killEvent != nil && len(liveRunners) > 0 {

			// Log kill reason
			if leaderFailed {
				ar.logger.Debug("leader task dead, destroying all tasks", "leader_task", killTask)
			} else {
				ar.logger.Debug("task failure, destroying all tasks", "failed_task", killTask)
			}

			// Emit kill event for live runners
			for _, tr := range liveRunners {
				tr.EmitEvent(killEvent)
			}

			// Kill 'em all
			states = ar.killTasks()

			// Wait for TaskRunners to exit before continuing to
			// prevent looping before TaskRunners have transitioned
			// to Dead.
			for _, tr := range liveRunners {
				select {
				case <-tr.WaitCh():
				case <-ar.waitCh:
				}
			}
		}

		// Get the client allocation
		calloc := ar.clientAlloc(states)

		// Update the server
		ar.stateUpdater.AllocStateUpdated(calloc)

		// Broadcast client alloc to listeners
		ar.allocBroadcaster.Send(calloc)
	}
}
// killTasks kills all task runners, leader (if there is one) first. Errors are
// logged except taskrunner.ErrTaskNotRunning which is ignored. Task states
// after Kill has been called are returned.
func (ar *allocRunner) killTasks() map[string]*structs.TaskState {
	var mu sync.Mutex
	states := make(map[string]*structs.TaskState, len(ar.tasks))

	// Kill leader first, synchronously
	for name, tr := range ar.tasks {
		if !tr.IsLeader() {
			continue
		}

		err := tr.Kill(context.TODO(), structs.NewTaskEvent(structs.TaskKilling))
		if err != nil && err != taskrunner.ErrTaskNotRunning {
			ar.logger.Warn("error stopping leader task", "error", err, "task_name", name)
		}

		state := tr.TaskState()
		states[name] = state
		break
	}

	// Kill the rest concurrently
	wg := sync.WaitGroup{}
	for name, tr := range ar.tasks {
		if tr.IsLeader() {
			continue
		}

		wg.Add(1)
		go func(name string, tr *taskrunner.TaskRunner) {
			defer wg.Done()
			err := tr.Kill(context.TODO(), structs.NewTaskEvent(structs.TaskKilling))
			if err != nil && err != taskrunner.ErrTaskNotRunning {
				ar.logger.Warn("error stopping task", "error", err, "task_name", name)
			}

			state := tr.TaskState()
			mu.Lock()
			states[name] = state
			mu.Unlock()
		}(name, tr)
	}
	wg.Wait()

	return states
}

// clientAlloc takes in the task states and returns an Allocation populated
// with Client specific fields
func (ar *allocRunner) clientAlloc(taskStates map[string]*structs.TaskState) *structs.Allocation {
	ar.stateLock.Lock()
	defer ar.stateLock.Unlock()

	// store task states for AllocState to expose
	ar.state.TaskStates = taskStates

	a := &structs.Allocation{
		ID:         ar.id,
		TaskStates: taskStates,
	}

	if d := ar.state.DeploymentStatus; d != nil {
		a.DeploymentStatus = d.Copy()
	}

	// Compute the ClientStatus
	if ar.state.ClientStatus != "" {
		// The client status is being forced
		a.ClientStatus, a.ClientDescription = ar.state.ClientStatus, ar.state.ClientDescription
	} else {
		a.ClientStatus, a.ClientDescription = getClientStatus(taskStates)
	}

	// If the allocation is terminal, make sure all required fields are properly
	// set.
	if a.ClientTerminalStatus() {
		alloc := ar.Alloc()

		// If we are part of a deployment and the alloc has failed, mark the
		// alloc as unhealthy. This guards against the watcher not being
		// started. If the health status is already set then terminal
		// allocations should not change it.
		if a.ClientStatus == structs.AllocClientStatusFailed &&
			alloc.DeploymentID != "" && !a.DeploymentStatus.HasHealth() {
			a.DeploymentStatus = &structs.AllocDeploymentStatus{
				Healthy: helper.BoolToPtr(false),
			}
		}

		// Make sure we have marked the finished at for every task. This is used
		// to calculate the reschedule time for failed allocations.
		now := time.Now()
		for _, task := range alloc.Job.LookupTaskGroup(alloc.TaskGroup).Tasks {
			ts, ok := a.TaskStates[task.Name]
			if !ok {
				ts = &structs.TaskState{}
				a.TaskStates[task.Name] = ts
			}
			if ts.FinishedAt.IsZero() {
				ts.FinishedAt = now
			}
		}
	}

	return a
}
// getClientStatus takes in the task states for a given allocation and computes
// the client status and description
func getClientStatus(taskStates map[string]*structs.TaskState) (status, description string) {
	var pending, running, dead, failed bool
	for _, state := range taskStates {
		switch state.State {
		case structs.TaskStateRunning:
			running = true
		case structs.TaskStatePending:
			pending = true
		case structs.TaskStateDead:
			if state.Failed {
				failed = true
			} else {
				dead = true
			}
		}
	}

	// Determine the alloc status
	if failed {
		return structs.AllocClientStatusFailed, "Failed tasks"
	} else if running {
		return structs.AllocClientStatusRunning, "Tasks are running"
	} else if pending {
		return structs.AllocClientStatusPending, "No tasks have started"
	} else if dead {
		return structs.AllocClientStatusComplete, "All tasks have completed"
	}

	return "", ""
}
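// The precedence above is failed > running > pending > complete. A small
// illustration with hypothetical task names:
//
//	status, desc := getClientStatus(map[string]*structs.TaskState{
//		"web": {State: structs.TaskStateRunning},
//		"log": {State: structs.TaskStateDead, Failed: true},
//	})
//	// status == structs.AllocClientStatusFailed, desc == "Failed tasks":
//	// one failed task marks the whole allocation failed even while other
//	// tasks are still running.
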
// SetClientStatus is a helper for forcing a specific client
// status on the alloc runner. This is used during restore errors
// when the task state can't be restored.
func (ar *allocRunner) SetClientStatus(clientStatus string) {
	ar.stateLock.Lock()
	defer ar.stateLock.Unlock()
	ar.state.ClientStatus = clientStatus
}

// AllocState returns a copy of allocation state including a snapshot of task
// states.
func (ar *allocRunner) AllocState() *state.State {
	ar.stateLock.RLock()
	state := ar.state.Copy()
	ar.stateLock.RUnlock()

	// If TaskStateUpdated has not been called yet, ar.state.TaskStates
	// won't be set as it is not the canonical source of TaskStates.
	// Populate the copy's map directly from the task runners instead.
	if len(state.TaskStates) == 0 {
		state.TaskStates = make(map[string]*structs.TaskState, len(ar.tasks))
		for k, tr := range ar.tasks {
			state.TaskStates[k] = tr.TaskState()
		}
	}

	// Generate alloc to get other state fields
	alloc := ar.clientAlloc(state.TaskStates)
	state.ClientStatus = alloc.ClientStatus
	state.ClientDescription = alloc.ClientDescription
	state.DeploymentStatus = alloc.DeploymentStatus

	return state
}

// Update asynchronously updates the running allocation with a new version
// received from the server.
// When processing a new update, we will first attempt to drain stale updates
// from the queue, before appending the new one.
func (ar *allocRunner) Update(update *structs.Allocation) {
	select {
	// Drain queued update from the channel if possible, and check the modify
	// index
	case oldUpdate := <-ar.allocUpdatedCh:
		// If the old update is newer than the replacement, then skip the new one
		// and return. This case shouldn't happen, but may in the case of a bug
		// elsewhere inside the system.
		if oldUpdate.AllocModifyIndex > update.AllocModifyIndex {
			ar.logger.Debug("Discarding allocation update due to newer alloc revision in queue",
				"old_modify_index", oldUpdate.AllocModifyIndex,
				"new_modify_index", update.AllocModifyIndex)
			ar.allocUpdatedCh <- oldUpdate
			return
		} else {
			ar.logger.Debug("Discarding allocation update",
				"skipped_modify_index", oldUpdate.AllocModifyIndex,
				"new_modify_index", update.AllocModifyIndex)
		}
	case <-ar.waitCh:
		ar.logger.Trace("AllocRunner has terminated, skipping alloc update",
			"modify_index", update.AllocModifyIndex)
		return
	default:
	}

	// Queue the new update
	ar.allocUpdatedCh <- update
}
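// Update's drain-then-queue step keeps at most one pending allocation in the
// buffered channel and prefers the highest AllocModifyIndex. The same pattern
// in isolation (an illustrative sketch, not part of this package):
//
//	updates := make(chan *structs.Allocation, 1)
//
//	enqueue := func(update *structs.Allocation) {
//		select {
//		case old := <-updates: // drain the stale update, if any
//			if old.AllocModifyIndex > update.AllocModifyIndex {
//				updates <- old // the queued copy is newer; keep it
//				return
//			}
//		default:
//		}
//		updates <- update
//	}
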
// handleAllocUpdates must be run in a goroutine. It serially applies
// allocation updates received on allocUpdatedCh until Run exits.
func (ar *allocRunner) handleAllocUpdates() {
	for {
		select {
		case update := <-ar.allocUpdatedCh:
			ar.handleAllocUpdate(update)
		case <-ar.waitCh:
			return
		}
	}
}

// handleAllocUpdate applies a single allocation update: it runs the update
// hooks if the alloc is non-terminal, propagates the update to the task
// runners, and kills all tasks (leader first) if the alloc has transitioned
// to a terminal state.
func (ar *allocRunner) handleAllocUpdate(update *structs.Allocation) {
	// Detect Stop updates
	stopping := !ar.Alloc().TerminalStatus() && update.TerminalStatus()

	// Update ar.alloc
	ar.setAlloc(update)

	// Run update hooks if not stopping or dead
	if !update.TerminalStatus() {
		if err := ar.update(update); err != nil {
			ar.logger.Error("error running update hooks", "error", err)
		}
	}

	// Update task runners
	for _, tr := range ar.tasks {
		tr.Update(update)
	}

	// If alloc is being terminated, kill all tasks, leader first
	if stopping {
		ar.killTasks()
	}
}

func (ar *allocRunner) Listener() *cstructs.AllocListener {
	return ar.allocBroadcaster.Listen()
}

func (ar *allocRunner) destroyImpl() {
	// Stop any running tasks and persist states in case the client is
	// shutdown before Destroy finishes.
	states := ar.killTasks()
	calloc := ar.clientAlloc(states)
	ar.stateUpdater.AllocStateUpdated(calloc)

	// Wait for tasks to exit and postrun hooks to finish
	<-ar.waitCh

	// Run destroy hooks
	if err := ar.destroy(); err != nil {
		ar.logger.Warn("error running destroy hooks", "error", err)
	}

	// Wait for task state update handler to exit before removing local
	// state if Run() ran at all.
	<-ar.taskStateUpdateHandlerCh

	// Cleanup state db
	if err := ar.stateDB.DeleteAllocationBucket(ar.id); err != nil {
		ar.logger.Warn("failed to delete allocation state", "error", err)
	}

	// Mark alloc as destroyed
	ar.destroyedLock.Lock()

	if !ar.shutdown {
		ar.shutdown = true
		close(ar.shutdownCh)
	}

	ar.destroyed = true
	close(ar.destroyCh)

	ar.destroyedLock.Unlock()
}

// Destroy the alloc runner by stopping it if it is still running and cleaning
// up all of its resources.
//
// This method is safe for calling concurrently with Run() and will cause it to
// exit (thus closing WaitCh).
// When the destroy action is completed, it will close DestroyCh().
func (ar *allocRunner) Destroy() {
	ar.destroyedLock.Lock()
	defer ar.destroyedLock.Unlock()

	if ar.destroyed {
		// Only destroy once
		return
	}

	if ar.destroyLaunched {
		// Only dispatch a destroy once
		return
	}

	ar.destroyLaunched = true

	// Synchronize calls to shutdown/destroy
	if ar.shutdownLaunched {
		go func() {
			ar.logger.Debug("Waiting for shutdown before destroying runner")
			<-ar.shutdownCh
			ar.destroyImpl()
		}()

		return
	}

	go ar.destroyImpl()
}

// IsDestroyed returns true if the alloc runner has been destroyed (stopped and
// garbage collected).
//
// This method is safe for calling concurrently with Run(). Callers must
// receive on WaitCh() to block until alloc runner has stopped and been
// destroyed.
func (ar *allocRunner) IsDestroyed() bool {
	ar.destroyedLock.Lock()
	defer ar.destroyedLock.Unlock()
	return ar.destroyed
}

// IsWaiting returns true if the alloc runner is waiting for its previous
// allocation to terminate.
//
// This method is safe for calling concurrently with Run().
func (ar *allocRunner) IsWaiting() bool {
	return ar.prevAllocWatcher.IsWaiting()
}

// DestroyCh is a channel that is closed when an allocrunner is closed due to
// an explicit call to Destroy().
func (ar *allocRunner) DestroyCh() <-chan struct{} {
	return ar.destroyCh
}

// ShutdownCh is a channel that is closed when an allocrunner is closed due to
// either an explicit call to Shutdown(), or Destroy().
func (ar *allocRunner) ShutdownCh() <-chan struct{} {
	return ar.shutdownCh
}

// Shutdown stops the alloc runner gracefully. It asynchronously shuts down all
// TaskRunners; the tasks themselves are unaffected and may be restored later.
// When shutdown is complete, ShutdownCh() is closed.
func (ar *allocRunner) Shutdown() {
	ar.destroyedLock.Lock()
	defer ar.destroyedLock.Unlock()

	// Destroy is a superset of Shutdown so there's nothing to do if this
	// has already been destroyed.
	if ar.destroyed {
		return
	}

	// Destroy is a superset of Shutdown so if it's been marked for destruction,
	// don't try and shutdown in parallel. If shutdown has been launched, don't
	// try again.
	if ar.destroyLaunched || ar.shutdownLaunched {
		return
	}

	ar.shutdownLaunched = true

	go func() {
		ar.logger.Trace("shutting down")

		// Shutdown tasks gracefully if they were run
		wg := sync.WaitGroup{}
		for _, tr := range ar.tasks {
			wg.Add(1)
			go func(tr *taskrunner.TaskRunner) {
				tr.Shutdown()
				wg.Done()
			}(tr)
		}
		wg.Wait()

		// Wait for Run to exit
		<-ar.waitCh

		// Run shutdown hooks
		ar.shutdownHooks()

		// Wait for updater to finish its final run
		<-ar.taskStateUpdateHandlerCh

		ar.destroyedLock.Lock()
		ar.shutdown = true
		close(ar.shutdownCh)
		ar.destroyedLock.Unlock()
	}()
}
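// A minimal sketch of the two teardown paths above (the calling client code is
// hypothetical):
//
//	// Agent shutdown/restart: stop the runner but leave tasks running so a
//	// new runner can Restore() and reattach to them later.
//	ar.Shutdown()
//	<-ar.ShutdownCh()
//
//	// Alloc removal/GC: kill the tasks and delete all local state.
//	ar.Destroy()
//	<-ar.DestroyCh()
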
// IsMigrating returns true if the alloc runner is migrating data from its
// previous allocation.
//
// This method is safe for calling concurrently with Run().
func (ar *allocRunner) IsMigrating() bool {
	return ar.prevAllocMigrator.IsMigrating()
}

func (ar *allocRunner) StatsReporter() interfaces.AllocStatsReporter {
	return ar
}

// LatestAllocStats returns the latest stats for an allocation. If taskFilter
// is set, only stats for that task -- if it exists -- are returned.
func (ar *allocRunner) LatestAllocStats(taskFilter string) (*cstructs.AllocResourceUsage, error) {
	astat := &cstructs.AllocResourceUsage{
		Tasks: make(map[string]*cstructs.TaskResourceUsage, len(ar.tasks)),
		ResourceUsage: &cstructs.ResourceUsage{
			MemoryStats: &cstructs.MemoryStats{},
			CpuStats:    &cstructs.CpuStats{},
			DeviceStats: []*device.DeviceGroupStats{},
		},
	}

	for name, tr := range ar.tasks {
		if taskFilter != "" && taskFilter != name {
			// Getting stats for a particular task and it's not this one!
			continue
		}

		if usage := tr.LatestResourceUsage(); usage != nil {
			astat.Tasks[name] = usage
			astat.ResourceUsage.Add(usage.ResourceUsage)
			if usage.Timestamp > astat.Timestamp {
				astat.Timestamp = usage.Timestamp
			}
		}
	}

	return astat, nil
}

func (ar *allocRunner) GetTaskEventHandler(taskName string) drivermanager.EventHandler {
	if tr, ok := ar.tasks[taskName]; ok {
		return func(ev *drivers.TaskEvent) {
			tr.EmitEvent(&structs.TaskEvent{
				Type:          structs.TaskDriverMessage,
				Time:          ev.Timestamp.UnixNano(),
				Details:       ev.Annotations,
				DriverMessage: ev.Message,
			})
		}
	}
	return nil
}

// RestartTask signals the task runner for the provided task to restart.
func (ar *allocRunner) RestartTask(taskName string, taskEvent *structs.TaskEvent) error {
	tr, ok := ar.tasks[taskName]
	if !ok {
		return fmt.Errorf("Could not find task runner for task: %s", taskName)
	}

	return tr.Restart(context.TODO(), taskEvent, false)
}

// RestartAll signals all task runners in the allocation to restart, passing a
// copy of the task event to each restart.
// Returns any errors in a concatenated form.
func (ar *allocRunner) RestartAll(taskEvent *structs.TaskEvent) error {
	var err *multierror.Error

	for tn := range ar.tasks {
		rerr := ar.RestartTask(tn, taskEvent.Copy())
		if rerr != nil {
			err = multierror.Append(err, rerr)
		}
	}

	return err.ErrorOrNil()
}

// Signal sends a signal request to task runners inside an allocation. If the
// taskName is empty, then it is sent to all tasks.
func (ar *allocRunner) Signal(taskName, signal string) error {
	event := structs.NewTaskEvent(structs.TaskSignaling).SetSignalText(signal)

	if taskName != "" {
		tr, ok := ar.tasks[taskName]
		if !ok {
			return fmt.Errorf("Task not found")
		}

		return tr.Signal(event, signal)
	}

	var err *multierror.Error

	for tn, tr := range ar.tasks {
		rerr := tr.Signal(event.Copy(), signal)
		if rerr != nil {
			err = multierror.Append(err, fmt.Errorf("Failed to signal task: %s, err: %v", tn, rerr))
		}
	}

	return err.ErrorOrNil()
}

func (ar *allocRunner) GetTaskExecHandler(taskName string) drivermanager.TaskExecHandler {
	tr, ok := ar.tasks[taskName]
	if !ok {
		return nil
	}

	return tr.TaskExecHandler()
}

func (ar *allocRunner) GetTaskDriverCapabilities(taskName string) (*drivers.Capabilities, error) {
	tr, ok := ar.tasks[taskName]
	if !ok {
		return nil, fmt.Errorf("task not found")
	}

	return tr.DriverCapabilities()
}
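// A minimal sketch of the per-task control surface above; the task name,
// signal, and taskEvent values are hypothetical and supplied by the caller:
//
//	// Send SIGHUP to one task, or to every task by passing "" as the name.
//	if err := ar.Signal("web", "SIGHUP"); err != nil {
//		ar.logger.Warn("failed to signal task", "error", err)
//	}
//
//	// Restart a single task, attaching a *structs.TaskEvent that records why.
//	if err := ar.RestartTask("web", taskEvent); err != nil {
//		ar.logger.Warn("failed to restart task", "error", err)
//	}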