// github.com/manicqin/nomad@v0.9.5/client/allocrunner/alloc_runner.go

package allocrunner

import (
	"context"
	"fmt"
	"path/filepath"
	"sync"
	"time"

	log "github.com/hashicorp/go-hclog"
	multierror "github.com/hashicorp/go-multierror"
	"github.com/hashicorp/nomad/client/allocdir"
	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
	"github.com/hashicorp/nomad/client/allocrunner/state"
	"github.com/hashicorp/nomad/client/allocrunner/taskrunner"
	"github.com/hashicorp/nomad/client/allocwatcher"
	"github.com/hashicorp/nomad/client/config"
	"github.com/hashicorp/nomad/client/consul"
	"github.com/hashicorp/nomad/client/devicemanager"
	cinterfaces "github.com/hashicorp/nomad/client/interfaces"
	"github.com/hashicorp/nomad/client/pluginmanager/drivermanager"
	cstate "github.com/hashicorp/nomad/client/state"
	cstructs "github.com/hashicorp/nomad/client/structs"
	"github.com/hashicorp/nomad/client/vaultclient"
	agentconsul "github.com/hashicorp/nomad/command/agent/consul"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/plugins/device"
	"github.com/hashicorp/nomad/plugins/drivers"
)

// allocRunner is used to run all the tasks in a given allocation
type allocRunner struct {
	// id is the ID of the allocation. Can be accessed without a lock
	id string

	// Logger is the logger for the alloc runner.
	logger log.Logger

	clientConfig *config.Config

	// stateUpdater is used to emit updated alloc state
	stateUpdater cinterfaces.AllocStateHandler

	// taskStateUpdatedCh is ticked whenever task state has changed. Must
	// have len==1 to allow nonblocking notification of state updates while
	// the goroutine is already processing a previous update.
	taskStateUpdatedCh chan struct{}

	// taskStateUpdateHandlerCh is closed when the task state handling
	// goroutine exits. It is unsafe to destroy the local allocation state
	// before this goroutine exits.
	taskStateUpdateHandlerCh chan struct{}

	// allocUpdatedCh is a channel that is used to stream allocation updates into
	// the allocUpdate handler. Must have len==1 to allow nonblocking notification
	// of new allocation updates while the goroutine is processing a previous
	// update.
	allocUpdatedCh chan *structs.Allocation

	// consulClient is the client used by the consul service hook for
	// registering services and checks
	consulClient consul.ConsulServiceAPI

	// vaultClient is used to manage Vault tokens
	vaultClient vaultclient.VaultClient

	// waitCh is closed when the Run loop has exited
	waitCh chan struct{}

	// destroyed is true when the Run loop has exited, postrun hooks have
	// run, and alloc runner has been destroyed. Must acquire destroyedLock
	// to access.
	destroyed bool

	// destroyCh is closed when the Run loop has exited, postrun hooks have
	// run, and alloc runner has been destroyed.
	destroyCh chan struct{}

	// shutdown is true when the Run loop has exited, and shutdown hooks have
	// run. Must acquire destroyedLock to access.
	shutdown bool

	// shutdownCh is closed when the Run loop has exited, and shutdown hooks
	// have run.
	shutdownCh chan struct{}

	// destroyLaunched is true if Destroy has been called. Must acquire
	// destroyedLock to access.
	destroyLaunched bool

	// shutdownLaunched is true if Shutdown has been called. Must acquire
	// destroyedLock to access.
	shutdownLaunched bool

	// destroyedLock guards destroyed, destroyLaunched, shutdownLaunched,
	// and serializes Shutdown/Destroy calls.
	destroyedLock sync.Mutex

	// Alloc captures the allocation being run.
	alloc     *structs.Allocation
	allocLock sync.RWMutex

	// state is the alloc runner's state
	state     *state.State
	stateLock sync.RWMutex

	stateDB cstate.StateDB

	// allocDir is used to build the allocations directory structure.
	allocDir *allocdir.AllocDir

	// runnerHooks are alloc runner lifecycle hooks that should be run on state
	// transitions.
	runnerHooks []interfaces.RunnerHook

	// tasks are the set of task runners
	tasks map[string]*taskrunner.TaskRunner

	// deviceStatsReporter is used to lookup resource usage for alloc devices
	deviceStatsReporter cinterfaces.DeviceStatsReporter

	// allocBroadcaster sends client allocation updates to all listeners
	allocBroadcaster *cstructs.AllocBroadcaster

	// prevAllocWatcher allows waiting for any previous or preempted allocations
	// to exit
	prevAllocWatcher allocwatcher.PrevAllocWatcher

	// prevAllocMigrator allows the migration of a previous allocation's alloc dir.
	prevAllocMigrator allocwatcher.PrevAllocMigrator

	// devicemanager is used to mount devices as well as lookup device
	// statistics
	devicemanager devicemanager.Manager

	// driverManager is responsible for dispensing driver plugins and registering
	// event handlers
	driverManager drivermanager.Manager

	// serversContactedCh is passed to TaskRunners so they can detect when
	// servers have been contacted for the first time in case of a failed
	// restore.
	serversContactedCh chan struct{}
}

// NewAllocRunner returns a new allocation runner.
func NewAllocRunner(config *Config) (*allocRunner, error) {
	alloc := config.Alloc
	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
	if tg == nil {
		return nil, fmt.Errorf("failed to lookup task group %q", alloc.TaskGroup)
	}

	ar := &allocRunner{
		id:                       alloc.ID,
		alloc:                    alloc,
		clientConfig:             config.ClientConfig,
		consulClient:             config.Consul,
		vaultClient:              config.Vault,
		tasks:                    make(map[string]*taskrunner.TaskRunner, len(tg.Tasks)),
		waitCh:                   make(chan struct{}),
		destroyCh:                make(chan struct{}),
		shutdownCh:               make(chan struct{}),
		state:                    &state.State{},
		stateDB:                  config.StateDB,
		stateUpdater:             config.StateUpdater,
		taskStateUpdatedCh:       make(chan struct{}, 1),
		taskStateUpdateHandlerCh: make(chan struct{}),
		allocUpdatedCh:           make(chan *structs.Allocation, 1),
		deviceStatsReporter:      config.DeviceStatsReporter,
		prevAllocWatcher:         config.PrevAllocWatcher,
		prevAllocMigrator:        config.PrevAllocMigrator,
		devicemanager:            config.DeviceManager,
		driverManager:            config.DriverManager,
		serversContactedCh:       config.ServersContactedCh,
	}

	// Create the logger based on the allocation ID
	ar.logger = config.Logger.Named("alloc_runner").With("alloc_id", alloc.ID)

	// Create alloc broadcaster
	ar.allocBroadcaster = cstructs.NewAllocBroadcaster(ar.logger)

	// Create alloc dir
	ar.allocDir = allocdir.NewAllocDir(ar.logger, filepath.Join(config.ClientConfig.AllocDir, alloc.ID))

	// Initialize the runner hooks.
	if err := ar.initRunnerHooks(config.ClientConfig); err != nil {
		return nil, err
	}

	// Create the TaskRunners
	if err := ar.initTaskRunners(tg.Tasks); err != nil {
		return nil, err
	}

	return ar, nil
}
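
// A rough sketch of how a client is expected to drive an allocRunner, based
// on the method contracts documented in this file (the real wiring lives in
// the Nomad client package and may differ in detail; `restoring` and
// `newAlloc` are placeholders):
//
//	ar, err := NewAllocRunner(config)
//	if err != nil {
//		// handle error
//	}
//	if restoring {
//		ar.Restore() // must happen after NewAllocRunner, before Run
//	}
//	go ar.Run()          // closes ar.WaitCh() when it exits
//	ar.Update(newAlloc)  // stream server-side alloc updates
//	ar.Destroy()         // or ar.Shutdown(); closes DestroyCh()/ShutdownCh()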

// initTaskRunners creates task runners but does *not* run them.
func (ar *allocRunner) initTaskRunners(tasks []*structs.Task) error {
	for _, task := range tasks {
		config := &taskrunner.Config{
			Alloc:               ar.alloc,
			ClientConfig:        ar.clientConfig,
			Task:                task,
			TaskDir:             ar.allocDir.NewTaskDir(task.Name),
			Logger:              ar.logger,
			StateDB:             ar.stateDB,
			StateUpdater:        ar,
			Consul:              ar.consulClient,
			Vault:               ar.vaultClient,
			DeviceStatsReporter: ar.deviceStatsReporter,
			DeviceManager:       ar.devicemanager,
			DriverManager:       ar.driverManager,
			ServersContactedCh:  ar.serversContactedCh,
		}

		// Create, but do not Run, the task runner
		tr, err := taskrunner.NewTaskRunner(config)
		if err != nil {
			return fmt.Errorf("failed creating runner for task %q: %v", task.Name, err)
		}

		ar.tasks[task.Name] = tr
	}
	return nil
}

func (ar *allocRunner) WaitCh() <-chan struct{} {
	return ar.waitCh
}

// Run the AllocRunner. Starts tasks if the alloc is non-terminal and closes
// WaitCh when it exits. Should be started in a goroutine.
func (ar *allocRunner) Run() {
	// Close the wait channel on return
	defer close(ar.waitCh)

	// Start the task state update handler
	go ar.handleTaskStateUpdates()

	// Start the alloc update handler
	go ar.handleAllocUpdates()

	// If the task update channel has been closed, that means we've been shut down.
	select {
	case <-ar.taskStateUpdateHandlerCh:
		return
	default:
	}

	// When handling a (potentially restored) terminal alloc, ensure tasks and
	// post-run hooks are run to perform any cleanup that's necessary and may
	// not have been done prior to the earlier termination.

	// Run the prestart hooks if non-terminal
	if ar.shouldRun() {
		if err := ar.prerun(); err != nil {
			ar.logger.Error("prerun failed", "error", err)

			for _, tr := range ar.tasks {
				tr.MarkFailedDead(fmt.Sprintf("failed to setup alloc: %v", err))
			}

			goto POST
		}
	}

	// Run the runners (blocks until they exit)
	ar.runTasks()

POST:
	if ar.isShuttingDown() {
		return
	}

	// Run the postrun hooks
	if err := ar.postrun(); err != nil {
		ar.logger.Error("postrun failed", "error", err)
	}
}

// shouldRun returns true if the alloc is in a state in which the alloc runner
// should run it.
func (ar *allocRunner) shouldRun() bool {
	// Do not run allocs that are terminal
	if ar.Alloc().TerminalStatus() {
		ar.logger.Trace("alloc terminal; not running",
			"desired_status", ar.Alloc().DesiredStatus,
			"client_status", ar.Alloc().ClientStatus,
		)
		return false
	}

	// It's possible that the alloc local state was marked terminal before
	// the server copy of the alloc (checked above) was marked as terminal,
	// so check the local state as well.
	switch clientStatus := ar.AllocState().ClientStatus; clientStatus {
	case structs.AllocClientStatusComplete, structs.AllocClientStatusFailed, structs.AllocClientStatusLost:
		ar.logger.Trace("alloc terminal; updating server and not running", "status", clientStatus)
		return false
	}

	return true
}

// runTasks is used to run the task runners and block until they exit.
func (ar *allocRunner) runTasks() {
	for _, task := range ar.tasks {
		go task.Run()
	}

	for _, task := range ar.tasks {
		<-task.WaitCh()
	}
}

// Alloc returns the current allocation being run by this runner as sent by the
// server. This view of the allocation does not have updated task states.
func (ar *allocRunner) Alloc() *structs.Allocation {
	ar.allocLock.RLock()
	defer ar.allocLock.RUnlock()
	return ar.alloc
}

func (ar *allocRunner) setAlloc(updated *structs.Allocation) {
	ar.allocLock.Lock()
	ar.alloc = updated
	ar.allocLock.Unlock()
}

// GetAllocDir returns the alloc dir which is safe for concurrent use.
func (ar *allocRunner) GetAllocDir() *allocdir.AllocDir {
	return ar.allocDir
}

// Restore state from database. Must be called after NewAllocRunner but before
// Run.
func (ar *allocRunner) Restore() error {
	// Retrieve deployment status to avoid resetting it across agent
	// restarts. Once a deployment status is set Nomad no longer monitors
	// alloc health, so we must persist deployment state across restarts.
	ds, err := ar.stateDB.GetDeploymentStatus(ar.id)
	if err != nil {
		return err
	}

	ar.stateLock.Lock()
	ar.state.DeploymentStatus = ds
	ar.stateLock.Unlock()

	// Restore task runners
	for _, tr := range ar.tasks {
		if err := tr.Restore(); err != nil {
			return err
		}
	}

	return nil
}

// persistDeploymentStatus stores AllocDeploymentStatus.
func (ar *allocRunner) persistDeploymentStatus(ds *structs.AllocDeploymentStatus) {
	if err := ar.stateDB.PutDeploymentStatus(ar.id, ds); err != nil {
		// While any persistence errors are very bad, the worst-case
		// scenario for failing to persist deployment status is that if
		// the agent is restarted it will monitor the deployment status
		// again. This could cause a deployment's status to change when
		// that shouldn't happen. However, allowing that seems better
		// than failing the entire allocation.
		ar.logger.Error("error storing deployment status", "error", err)
	}
}

// TaskStateUpdated is called by TaskRunner when a task's state has been
// updated. It does not process the update synchronously but instead notifies a
// goroutine that the state has changed. Since processing the state change may
// cause the task to be killed (thus changing its state again) it cannot be
// done synchronously as it would cause a deadlock due to reentrancy.
//
// The goroutine is used to compute changes to the alloc's ClientStatus and to
// update the server with the new state.
func (ar *allocRunner) TaskStateUpdated() {
	select {
	case ar.taskStateUpdatedCh <- struct{}{}:
	default:
		// already pending updates
	}
}
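
// Note: because taskStateUpdatedCh is buffered with capacity 1 and the send
// above is non-blocking, any number of TaskStateUpdated calls made while the
// handler is busy collapse into a single pending notification. A rough
// illustration:
//
//	ar.TaskStateUpdated() // buffers one notification
//	ar.TaskStateUpdated() // already pending; falls through the default case
//	ar.TaskStateUpdated() // already pending; falls through the default case
//	// handleTaskStateUpdates wakes once for the batch and snapshots all task states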

// handleTaskStateUpdates must be run in a goroutine as it monitors
// taskStateUpdatedCh for task state update notifications and processes task
// states.
//
// Processing task state updates must be done in a goroutine as it may have to
// kill tasks which causes further task state updates.
func (ar *allocRunner) handleTaskStateUpdates() {
	defer close(ar.taskStateUpdateHandlerCh)

	for done := false; !done; {
		select {
		case <-ar.taskStateUpdatedCh:
		case <-ar.waitCh:
			// Run has exited, sync once more to ensure final
			// states are collected.
			done = true
		}

		ar.logger.Trace("handling task state update", "done", done)

		// Set with the appropriate event if task runners should be
		// killed.
		var killEvent *structs.TaskEvent

		// If task runners should be killed, this is set to the task
		// name whose fault it is.
		killTask := ""

		// True if task runners should be killed because a leader
		// failed (informational).
		leaderFailed := false

		// Task state has been updated; gather the state of the other tasks
		trNum := len(ar.tasks)
		liveRunners := make([]*taskrunner.TaskRunner, 0, trNum)
		states := make(map[string]*structs.TaskState, trNum)

		for name, tr := range ar.tasks {
			state := tr.TaskState()
			states[name] = state

			// Capture live task runners in case we need to kill them
			if state.State != structs.TaskStateDead {
				liveRunners = append(liveRunners, tr)
				continue
			}

			// Task is dead, determine if other tasks should be killed
			if state.Failed {
				// Only set failed event if no event has been
				// set yet to give dead leaders priority.
				if killEvent == nil {
					killTask = name
					killEvent = structs.NewTaskEvent(structs.TaskSiblingFailed).
						SetFailedSibling(name)
				}
			} else if tr.IsLeader() {
				killEvent = structs.NewTaskEvent(structs.TaskLeaderDead)
				leaderFailed = true
				killTask = name
			}
		}

		// If there's a kill event set and live runners, kill them
		if killEvent != nil && len(liveRunners) > 0 {

			// Log kill reason
			if leaderFailed {
				ar.logger.Debug("leader task dead, destroying all tasks", "leader_task", killTask)
			} else {
				ar.logger.Debug("task failure, destroying all tasks", "failed_task", killTask)
			}

			// Emit kill event for live runners
			for _, tr := range liveRunners {
				tr.EmitEvent(killEvent)
			}

			// Kill 'em all
			states = ar.killTasks()

			// Wait for TaskRunners to exit before continuing to
			// prevent looping before TaskRunners have transitioned
			// to Dead.
			for _, tr := range liveRunners {
				select {
				case <-tr.WaitCh():
				case <-ar.waitCh:
				}
			}
		}

		// Get the client allocation
		calloc := ar.clientAlloc(states)

		// Update the server
		ar.stateUpdater.AllocStateUpdated(calloc)

		// Broadcast client alloc to listeners
		ar.allocBroadcaster.Send(calloc)
	}
}

// killTasks kills all task runners, leader (if there is one) first. Errors are
// logged except taskrunner.ErrTaskNotRunning which is ignored. Task states
// after Kill has been called are returned.
func (ar *allocRunner) killTasks() map[string]*structs.TaskState {
	var mu sync.Mutex
	states := make(map[string]*structs.TaskState, len(ar.tasks))

	// run alloc prekill hooks
	ar.preKillHooks()

	// Kill leader first, synchronously
	for name, tr := range ar.tasks {
		if !tr.IsLeader() {
			continue
		}

		taskEvent := structs.NewTaskEvent(structs.TaskKilling)
		taskEvent.SetKillTimeout(tr.Task().KillTimeout)
		err := tr.Kill(context.TODO(), taskEvent)
		if err != nil && err != taskrunner.ErrTaskNotRunning {
			ar.logger.Warn("error stopping leader task", "error", err, "task_name", name)
		}

		state := tr.TaskState()
		states[name] = state
		break
	}

	// Kill the rest concurrently
	wg := sync.WaitGroup{}
	for name, tr := range ar.tasks {
		if tr.IsLeader() {
			continue
		}

		wg.Add(1)
		go func(name string, tr *taskrunner.TaskRunner) {
			defer wg.Done()
			taskEvent := structs.NewTaskEvent(structs.TaskKilling)
			taskEvent.SetKillTimeout(tr.Task().KillTimeout)
			err := tr.Kill(context.TODO(), taskEvent)
			if err != nil && err != taskrunner.ErrTaskNotRunning {
				ar.logger.Warn("error stopping task", "error", err, "task_name", name)
			}

			state := tr.TaskState()
			mu.Lock()
			states[name] = state
			mu.Unlock()
		}(name, tr)
	}
	wg.Wait()

	return states
}

// clientAlloc takes in the task states and returns an Allocation populated
// with Client specific fields
func (ar *allocRunner) clientAlloc(taskStates map[string]*structs.TaskState) *structs.Allocation {
	ar.stateLock.Lock()
	defer ar.stateLock.Unlock()

	// store task states for AllocState to expose
	ar.state.TaskStates = taskStates

	a := &structs.Allocation{
		ID:         ar.id,
		TaskStates: taskStates,
	}

	if d := ar.state.DeploymentStatus; d != nil {
		a.DeploymentStatus = d.Copy()
	}

	// Compute the ClientStatus
	if ar.state.ClientStatus != "" {
		// The client status is being forced
		a.ClientStatus, a.ClientDescription = ar.state.ClientStatus, ar.state.ClientDescription
	} else {
		a.ClientStatus, a.ClientDescription = getClientStatus(taskStates)
	}

	// If the allocation is terminal, make sure all required fields are properly
	// set.
	if a.ClientTerminalStatus() {
		alloc := ar.Alloc()

		// If we are part of a deployment and the alloc has failed, mark the
		// alloc as unhealthy. This guards against the watcher not being
		// started. If the health status has already been set, it is not
		// overridden.
		if a.ClientStatus == structs.AllocClientStatusFailed &&
			alloc.DeploymentID != "" && !a.DeploymentStatus.HasHealth() {
			a.DeploymentStatus = &structs.AllocDeploymentStatus{
				Healthy: helper.BoolToPtr(false),
			}
		}

		// Make sure we have marked FinishedAt for every task. This is used
		// to calculate the reschedule time for failed allocations.
		now := time.Now()
		for taskName := range ar.tasks {
			ts, ok := a.TaskStates[taskName]
			if !ok {
				ts = &structs.TaskState{}
				a.TaskStates[taskName] = ts
			}
			if ts.FinishedAt.IsZero() {
				ts.FinishedAt = now
			}
		}
	}

	return a
}

// getClientStatus takes in the task states for a given allocation and computes
// the client status and description
func getClientStatus(taskStates map[string]*structs.TaskState) (status, description string) {
	var pending, running, dead, failed bool
	for _, state := range taskStates {
		switch state.State {
		case structs.TaskStateRunning:
			running = true
		case structs.TaskStatePending:
			pending = true
		case structs.TaskStateDead:
			if state.Failed {
				failed = true
			} else {
				dead = true
			}
		}
	}

	// Determine the alloc status
	if failed {
		return structs.AllocClientStatusFailed, "Failed tasks"
	} else if running {
		return structs.AllocClientStatusRunning, "Tasks are running"
	} else if pending {
		return structs.AllocClientStatusPending, "No tasks have started"
	} else if dead {
		return structs.AllocClientStatusComplete, "All tasks have completed"
	}

	return "", ""
}
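
// In getClientStatus above, failed takes precedence over running, running
// over pending, and pending over complete: for example, a single failed task
// marks the whole allocation failed even if sibling tasks are still running.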

// SetClientStatus is a helper for forcing a specific client
// status on the alloc runner. This is used during restore errors
// when the task state can't be restored.
func (ar *allocRunner) SetClientStatus(clientStatus string) {
	ar.stateLock.Lock()
	defer ar.stateLock.Unlock()
	ar.state.ClientStatus = clientStatus
}

// AllocState returns a copy of allocation state including a snapshot of task
// states.
func (ar *allocRunner) AllocState() *state.State {
	ar.stateLock.RLock()
	state := ar.state.Copy()
	ar.stateLock.RUnlock()

	// If TaskStateUpdated has not been called yet, ar.state.TaskStates
	// won't be set as it is not the canonical source of TaskStates.
	if len(state.TaskStates) == 0 {
		ar.state.TaskStates = make(map[string]*structs.TaskState, len(ar.tasks))
		for k, tr := range ar.tasks {
			state.TaskStates[k] = tr.TaskState()
		}
	}

	// Generate alloc to get other state fields
	alloc := ar.clientAlloc(state.TaskStates)
	state.ClientStatus = alloc.ClientStatus
	state.ClientDescription = alloc.ClientDescription
	state.DeploymentStatus = alloc.DeploymentStatus

	return state
}

// Update asynchronously updates the running allocation with a new version
// received from the server.
// When processing a new update, we will first attempt to drain stale updates
// from the queue before appending the new one.
func (ar *allocRunner) Update(update *structs.Allocation) {
	select {
	// Drain the queued update from the channel if possible, and check the
	// modify index
	case oldUpdate := <-ar.allocUpdatedCh:
		// If the old update is newer than the replacement, then skip the new one
		// and return. This case shouldn't happen, but may in the case of a bug
		// elsewhere inside the system.
		if oldUpdate.AllocModifyIndex > update.AllocModifyIndex {
			ar.logger.Debug("Discarding allocation update due to newer alloc revision in queue",
				"old_modify_index", oldUpdate.AllocModifyIndex,
				"new_modify_index", update.AllocModifyIndex)
			ar.allocUpdatedCh <- oldUpdate
			return
		} else {
			ar.logger.Debug("Discarding allocation update",
				"skipped_modify_index", oldUpdate.AllocModifyIndex,
				"new_modify_index", update.AllocModifyIndex)
		}
	case <-ar.waitCh:
		ar.logger.Trace("AllocRunner has terminated, skipping alloc update",
			"modify_index", update.AllocModifyIndex)
		return
	default:
	}

	// Queue the new update
	ar.allocUpdatedCh <- update
}

func (ar *allocRunner) handleAllocUpdates() {
	for {
		select {
		case update := <-ar.allocUpdatedCh:
			ar.handleAllocUpdate(update)
		case <-ar.waitCh:
			return
		}
	}
}

// handleAllocUpdate applies a single updated allocation. Updates are processed
// serially by handleAllocUpdates; if there is already a pending update it will
// have been discarded and replaced by the latest update.
func (ar *allocRunner) handleAllocUpdate(update *structs.Allocation) {
	// Detect Stop updates
	stopping := !ar.Alloc().TerminalStatus() && update.TerminalStatus()

	// Update ar.alloc
	ar.setAlloc(update)

	// Run update hooks if not stopping or dead
	if !update.TerminalStatus() {
		if err := ar.update(update); err != nil {
			ar.logger.Error("error running update hooks", "error", err)
		}
	}

	// Update task runners
	for _, tr := range ar.tasks {
		tr.Update(update)
	}

	// If alloc is being terminated, kill all tasks, leader first
	if stopping {
		ar.killTasks()
	}
}

func (ar *allocRunner) Listener() *cstructs.AllocListener {
	return ar.allocBroadcaster.Listen()
}

func (ar *allocRunner) destroyImpl() {
	// Stop any running tasks and persist states in case the client is
	// shutdown before Destroy finishes.
	states := ar.killTasks()
	calloc := ar.clientAlloc(states)
	ar.stateUpdater.AllocStateUpdated(calloc)

	// Wait for tasks to exit and postrun hooks to finish
	<-ar.waitCh

	// Run destroy hooks
	if err := ar.destroy(); err != nil {
		ar.logger.Warn("error running destroy hooks", "error", err)
	}

	// Wait for the task state update handler to exit before removing local
	// state if Run() ran at all.
	<-ar.taskStateUpdateHandlerCh

	// Mark alloc as destroyed
	ar.destroyedLock.Lock()

	// Clean up the state db while holding the lock to avoid a race with the
	// periodic PersistState, which may resurrect the alloc
	if err := ar.stateDB.DeleteAllocationBucket(ar.id); err != nil {
		ar.logger.Warn("failed to delete allocation state", "error", err)
	}

	if !ar.shutdown {
		ar.shutdown = true
		close(ar.shutdownCh)
	}

	ar.destroyed = true
	close(ar.destroyCh)

	ar.destroyedLock.Unlock()
}

func (ar *allocRunner) PersistState() error {
	ar.destroyedLock.Lock()
	defer ar.destroyedLock.Unlock()

	if ar.destroyed {
		err := ar.stateDB.DeleteAllocationBucket(ar.id)
		if err != nil {
			ar.logger.Warn("failed to delete allocation bucket", "error", err)
		}
		return nil
	}

	// TODO: consider persisting deployment state along with task status.
	// While we study why only the alloc is persisted, I opted to maintain the
	// current behavior and not risk adding yet more IO calls unnecessarily.
	return ar.stateDB.PutAllocation(ar.Alloc())
}
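
// Destroy and Shutdown below are the two ways an allocRunner is torn down,
// serialized by destroyedLock. Roughly: Shutdown stops the runner while
// leaving tasks untouched so they can be Restored later (e.g. across an
// agent restart), whereas Destroy kills the tasks and deletes the alloc's
// local state. Destroy is a superset of Shutdown.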

// Destroy the alloc runner by stopping it if it is still running and cleaning
// up all of its resources.
//
// This method is safe for calling concurrently with Run() and will cause it to
// exit (thus closing WaitCh).
// When the destroy action is completed, it will close DestroyCh().
func (ar *allocRunner) Destroy() {
	ar.destroyedLock.Lock()
	defer ar.destroyedLock.Unlock()

	if ar.destroyed {
		// Only destroy once
		return
	}

	if ar.destroyLaunched {
		// Only dispatch a destroy once
		return
	}

	ar.destroyLaunched = true

	// Synchronize calls to shutdown/destroy
	if ar.shutdownLaunched {
		go func() {
			ar.logger.Debug("Waiting for shutdown before destroying runner")
			<-ar.shutdownCh
			ar.destroyImpl()
		}()

		return
	}

	go ar.destroyImpl()
}

// IsDestroyed returns true if the alloc runner has been destroyed (stopped and
// garbage collected).
//
// This method is safe for calling concurrently with Run(). Callers must
// receive on WaitCh() to block until the alloc runner has stopped and been
// destroyed.
func (ar *allocRunner) IsDestroyed() bool {
	ar.destroyedLock.Lock()
	defer ar.destroyedLock.Unlock()
	return ar.destroyed
}

// IsWaiting returns true if the alloc runner is waiting for its previous
// allocation to terminate.
//
// This method is safe for calling concurrently with Run().
func (ar *allocRunner) IsWaiting() bool {
	return ar.prevAllocWatcher.IsWaiting()
}

// isShuttingDown returns true if the alloc runner is in a shutdown state
// due to a call to Shutdown() or Destroy()
func (ar *allocRunner) isShuttingDown() bool {
	ar.destroyedLock.Lock()
	defer ar.destroyedLock.Unlock()
	return ar.shutdownLaunched
}

// DestroyCh is a channel that is closed when an allocrunner is closed due to
// an explicit call to Destroy().
func (ar *allocRunner) DestroyCh() <-chan struct{} {
	return ar.destroyCh
}

// ShutdownCh is a channel that is closed when an allocrunner is closed due to
// either an explicit call to Shutdown(), or Destroy().
func (ar *allocRunner) ShutdownCh() <-chan struct{} {
	return ar.shutdownCh
}

// Shutdown the AllocRunner gracefully. Asynchronously shuts down all
// TaskRunners. Tasks are unaffected and may be restored.
// When the shutdown action is completed, it will close ShutdownCh().
func (ar *allocRunner) Shutdown() {
	ar.destroyedLock.Lock()
	defer ar.destroyedLock.Unlock()

	// Destroy is a superset of Shutdown so there's nothing to do if this
	// has already been destroyed.
	if ar.destroyed {
		return
	}

	// Destroy is a superset of Shutdown so if it's been marked for destruction,
	// don't try to shut down in parallel. If shutdown has been launched, don't
	// try again.
	if ar.destroyLaunched || ar.shutdownLaunched {
		return
	}

	ar.shutdownLaunched = true

	go func() {
		ar.logger.Trace("shutting down")

		// Shutdown tasks gracefully if they were run
		wg := sync.WaitGroup{}
		for _, tr := range ar.tasks {
			wg.Add(1)
			go func(tr *taskrunner.TaskRunner) {
				tr.Shutdown()
				wg.Done()
			}(tr)
		}
		wg.Wait()

		// Wait for Run to exit
		<-ar.waitCh

		// Run shutdown hooks
		ar.shutdownHooks()

		// Wait for the updater to finish its final run
		<-ar.taskStateUpdateHandlerCh

		ar.destroyedLock.Lock()
		ar.shutdown = true
		close(ar.shutdownCh)
		ar.destroyedLock.Unlock()
	}()
}

// IsMigrating returns true if the alloc runner is migrating data from its
// previous allocation.
//
// This method is safe for calling concurrently with Run().
func (ar *allocRunner) IsMigrating() bool {
	return ar.prevAllocMigrator.IsMigrating()
}

func (ar *allocRunner) StatsReporter() interfaces.AllocStatsReporter {
	return ar
}

// LatestAllocStats returns the latest stats for an allocation. If taskFilter
// is set, only stats for that task -- if it exists -- are returned.
func (ar *allocRunner) LatestAllocStats(taskFilter string) (*cstructs.AllocResourceUsage, error) {
	astat := &cstructs.AllocResourceUsage{
		Tasks: make(map[string]*cstructs.TaskResourceUsage, len(ar.tasks)),
		ResourceUsage: &cstructs.ResourceUsage{
			MemoryStats: &cstructs.MemoryStats{},
			CpuStats:    &cstructs.CpuStats{},
			DeviceStats: []*device.DeviceGroupStats{},
		},
	}

	for name, tr := range ar.tasks {
		if taskFilter != "" && taskFilter != name {
			// Getting stats for a particular task and it's not this one!
			continue
		}

		if usage := tr.LatestResourceUsage(); usage != nil {
			astat.Tasks[name] = usage
			astat.ResourceUsage.Add(usage.ResourceUsage)
			if usage.Timestamp > astat.Timestamp {
				astat.Timestamp = usage.Timestamp
			}
		}
	}

	return astat, nil
}

func (ar *allocRunner) GetTaskEventHandler(taskName string) drivermanager.EventHandler {
	if tr, ok := ar.tasks[taskName]; ok {
		return func(ev *drivers.TaskEvent) {
			tr.EmitEvent(&structs.TaskEvent{
				Type:          structs.TaskDriverMessage,
				Time:          ev.Timestamp.UnixNano(),
				Details:       ev.Annotations,
				DriverMessage: ev.Message,
			})
		}
	}
	return nil
}
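
// The three restart entry points below differ in scope and concurrency:
// RestartTask restarts a single named task, Restart (the WorkloadRestarter
// interface) restarts every task concurrently, and RestartAll restarts every
// task sequentially with its own copy of the task event.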

// RestartTask signals the task runner for the provided task to restart.
func (ar *allocRunner) RestartTask(taskName string, taskEvent *structs.TaskEvent) error {
	tr, ok := ar.tasks[taskName]
	if !ok {
		return fmt.Errorf("Could not find task runner for task: %s", taskName)
	}

	return tr.Restart(context.TODO(), taskEvent, false)
}

// Restart satisfies the WorkloadRestarter interface and restarts all task
// runners concurrently.
func (ar *allocRunner) Restart(ctx context.Context, event *structs.TaskEvent, failure bool) error {
	waitCh := make(chan struct{})
	var err *multierror.Error
	var errMutex sync.Mutex

	go func() {
		var wg sync.WaitGroup
		defer close(waitCh)
		for tn, tr := range ar.tasks {
			wg.Add(1)
			go func(taskName string, r agentconsul.WorkloadRestarter) {
				defer wg.Done()
				e := r.Restart(ctx, event, failure)
				if e != nil {
					errMutex.Lock()
					defer errMutex.Unlock()
					err = multierror.Append(err, fmt.Errorf("failed to restart task %s: %v", taskName, e))
				}
			}(tn, tr)
		}
		wg.Wait()
	}()

	select {
	case <-waitCh:
	case <-ctx.Done():
	}

	return err.ErrorOrNil()
}

// RestartAll signals all task runners in the allocation to restart and passes
// a copy of the task event to each restart event.
// Returns any errors in a concatenated form.
func (ar *allocRunner) RestartAll(taskEvent *structs.TaskEvent) error {
	var err *multierror.Error

	for tn := range ar.tasks {
		rerr := ar.RestartTask(tn, taskEvent.Copy())
		if rerr != nil {
			err = multierror.Append(err, rerr)
		}
	}

	return err.ErrorOrNil()
}

// Signal sends a signal request to task runners inside an allocation. If the
// taskName is empty, then it is sent to all tasks.
func (ar *allocRunner) Signal(taskName, signal string) error {
	event := structs.NewTaskEvent(structs.TaskSignaling).SetSignalText(signal)

	if taskName != "" {
		tr, ok := ar.tasks[taskName]
		if !ok {
			return fmt.Errorf("Task not found")
		}

		return tr.Signal(event, signal)
	}

	var err *multierror.Error

	for tn, tr := range ar.tasks {
		rerr := tr.Signal(event.Copy(), signal)
		if rerr != nil {
			err = multierror.Append(err, fmt.Errorf("Failed to signal task: %s, err: %v", tn, rerr))
		}
	}

	return err.ErrorOrNil()
}

func (ar *allocRunner) GetTaskExecHandler(taskName string) drivermanager.TaskExecHandler {
	tr, ok := ar.tasks[taskName]
	if !ok {
		return nil
	}

	return tr.TaskExecHandler()
}

func (ar *allocRunner) GetTaskDriverCapabilities(taskName string) (*drivers.Capabilities, error) {
	tr, ok := ar.tasks[taskName]
	if !ok {
		return nil, fmt.Errorf("task not found")
	}

	return tr.DriverCapabilities()
}