github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/client/allocrunner/alloc_runner.go

package allocrunner

import (
	"context"
	"fmt"
	"path/filepath"
	"sync"
	"time"

	log "github.com/hashicorp/go-hclog"
	multierror "github.com/hashicorp/go-multierror"
	"github.com/hashicorp/nomad/client/allocdir"
	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
	"github.com/hashicorp/nomad/client/allocrunner/state"
	"github.com/hashicorp/nomad/client/allocrunner/taskrunner"
	"github.com/hashicorp/nomad/client/allocwatcher"
	"github.com/hashicorp/nomad/client/config"
	"github.com/hashicorp/nomad/client/consul"
	"github.com/hashicorp/nomad/client/devicemanager"
	"github.com/hashicorp/nomad/client/dynamicplugins"
	cinterfaces "github.com/hashicorp/nomad/client/interfaces"
	"github.com/hashicorp/nomad/client/pluginmanager/csimanager"
	"github.com/hashicorp/nomad/client/pluginmanager/drivermanager"
	cstate "github.com/hashicorp/nomad/client/state"
	cstructs "github.com/hashicorp/nomad/client/structs"
	"github.com/hashicorp/nomad/client/vaultclient"
	agentconsul "github.com/hashicorp/nomad/command/agent/consul"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/plugins/device"
	"github.com/hashicorp/nomad/plugins/drivers"
)

// allocRunner is used to run all the tasks in a given allocation
type allocRunner struct {
	// id is the ID of the allocation. Can be accessed without a lock
	id string

	// Logger is the logger for the alloc runner.
	logger log.Logger

	clientConfig *config.Config

	// stateUpdater is used to emit updated alloc state
	stateUpdater cinterfaces.AllocStateHandler

	// taskStateUpdatedCh is ticked whenever task state has changed. Must
	// have len==1 to allow nonblocking notification of state updates while
	// the goroutine is already processing a previous update.
	taskStateUpdatedCh chan struct{}

	// taskStateUpdateHandlerCh is closed when the task state handling
	// goroutine exits. It is unsafe to destroy the local allocation state
	// before this goroutine exits.
	taskStateUpdateHandlerCh chan struct{}

	// allocUpdatedCh is a channel that is used to stream allocation updates into
	// the allocUpdate handler. Must have len==1 to allow nonblocking notification
	// of new allocation updates while the goroutine is processing a previous
	// update.
	allocUpdatedCh chan *structs.Allocation

	// consulClient is the client used by the consul service hook for
	// registering services and checks
	consulClient consul.ConsulServiceAPI

	// sidsClient is the client used by the service identity hook for
	// managing SI tokens
	sidsClient consul.ServiceIdentityAPI

	// vaultClient is used to manage Vault tokens
	vaultClient vaultclient.VaultClient

	// waitCh is closed when the Run loop has exited
	waitCh chan struct{}

	// destroyed is true when the Run loop has exited, postrun hooks have
	// run, and alloc runner has been destroyed. Must acquire destroyedLock
	// to access.
	destroyed bool

	// destroyCh is closed when the Run loop has exited, postrun hooks have
	// run, and alloc runner has been destroyed.
	destroyCh chan struct{}

	// shutdown is true when the Run loop has exited, and shutdown hooks have
	// run. Must acquire destroyedLock to access.
	shutdown bool

	// shutdownCh is closed when the Run loop has exited, and shutdown hooks
	// have run.
	shutdownCh chan struct{}

	// destroyLaunched is true if Destroy has been called. Must acquire
	// destroyedLock to access.
	destroyLaunched bool

	// shutdownLaunched is true if Shutdown has been called. Must acquire
	// destroyedLock to access.
	shutdownLaunched bool

	// destroyedLock guards destroyed, destroyLaunched, shutdownLaunched,
	// and serializes Shutdown/Destroy calls.
	destroyedLock sync.Mutex

	// Alloc captures the allocation being run.
	alloc     *structs.Allocation
	allocLock sync.RWMutex

	// state is the alloc runner's state
	state     *state.State
	stateLock sync.RWMutex

	stateDB cstate.StateDB

	// allocDir is used to build the allocations directory structure.
	allocDir *allocdir.AllocDir

	// runnerHooks are alloc runner lifecycle hooks that should be run on state
	// transitions.
	runnerHooks []interfaces.RunnerHook

	// hookState is the output of allocrunner hooks
	hookState   *cstructs.AllocHookResources
	hookStateMu sync.RWMutex

	// tasks are the set of task runners
	tasks map[string]*taskrunner.TaskRunner

	// deviceStatsReporter is used to lookup resource usage for alloc devices
	deviceStatsReporter cinterfaces.DeviceStatsReporter

	// allocBroadcaster sends client allocation updates to all listeners
	allocBroadcaster *cstructs.AllocBroadcaster

	// prevAllocWatcher allows waiting for any previous or preempted allocations
	// to exit
	prevAllocWatcher allocwatcher.PrevAllocWatcher

	// prevAllocMigrator allows the migration of a previous allocation's alloc dir.
	prevAllocMigrator allocwatcher.PrevAllocMigrator

	// dynamicRegistry contains all locally registered dynamic plugins (e.g. CSI
	// plugins).
	dynamicRegistry dynamicplugins.Registry

	// csiManager is used to wait for CSI Volumes to be attached, and by the task
	// runner to manage their mounting
	csiManager csimanager.Manager

	// devicemanager is used to mount devices as well as lookup device
	// statistics
	devicemanager devicemanager.Manager

	// driverManager is responsible for dispensing driver plugins and registering
	// event handlers
	driverManager drivermanager.Manager

	// serversContactedCh is passed to TaskRunners so they can detect when
	// servers have been contacted for the first time in case of a failed
	// restore.
	serversContactedCh chan struct{}

	taskHookCoordinator *taskHookCoordinator

	// rpcClient is the RPC Client that should be used by the allocrunner and its
	// hooks to communicate with Nomad Servers.
	rpcClient RPCer
}

// RPCer is the interface needed by hooks to make RPC calls.
type RPCer interface {
	RPC(method string, args interface{}, reply interface{}) error
}
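// The buffered-channel notification pattern described for taskStateUpdatedCh
// and allocUpdatedCh above can be summarized with the following illustrative
// sketch. notifySketch is hypothetical and is not used by the runner; it only
// shows why a len==1 buffer lets senders record "there is pending work"
// without blocking while the handler goroutine is busy, coalescing duplicate
// notifications into one.
func notifySketch(pending chan struct{}) {
	// pending must be created with make(chan struct{}, 1).
	select {
	case pending <- struct{}{}:
		// Notification queued; the handler will pick it up on its next loop.
	default:
		// A notification is already pending; coalesce with it.
	}
}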
// NewAllocRunner returns a new allocation runner.
func NewAllocRunner(config *Config) (*allocRunner, error) {
	alloc := config.Alloc
	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
	if tg == nil {
		return nil, fmt.Errorf("failed to lookup task group %q", alloc.TaskGroup)
	}

	ar := &allocRunner{
		id:                       alloc.ID,
		alloc:                    alloc,
		clientConfig:             config.ClientConfig,
		consulClient:             config.Consul,
		sidsClient:               config.ConsulSI,
		vaultClient:              config.Vault,
		tasks:                    make(map[string]*taskrunner.TaskRunner, len(tg.Tasks)),
		waitCh:                   make(chan struct{}),
		destroyCh:                make(chan struct{}),
		shutdownCh:               make(chan struct{}),
		state:                    &state.State{},
		stateDB:                  config.StateDB,
		stateUpdater:             config.StateUpdater,
		taskStateUpdatedCh:       make(chan struct{}, 1),
		taskStateUpdateHandlerCh: make(chan struct{}),
		allocUpdatedCh:           make(chan *structs.Allocation, 1),
		deviceStatsReporter:      config.DeviceStatsReporter,
		prevAllocWatcher:         config.PrevAllocWatcher,
		prevAllocMigrator:        config.PrevAllocMigrator,
		dynamicRegistry:          config.DynamicRegistry,
		csiManager:               config.CSIManager,
		devicemanager:            config.DeviceManager,
		driverManager:            config.DriverManager,
		serversContactedCh:       config.ServersContactedCh,
		rpcClient:                config.RPCClient,
	}

	// Create the logger based on the allocation ID
	ar.logger = config.Logger.Named("alloc_runner").With("alloc_id", alloc.ID)

	// Create alloc broadcaster
	ar.allocBroadcaster = cstructs.NewAllocBroadcaster(ar.logger)

	// Create alloc dir
	ar.allocDir = allocdir.NewAllocDir(ar.logger, filepath.Join(config.ClientConfig.AllocDir, alloc.ID))

	ar.taskHookCoordinator = newTaskHookCoordinator(ar.logger, tg.Tasks)

	// Initialize the runner's hooks.
	if err := ar.initRunnerHooks(config.ClientConfig); err != nil {
		return nil, err
	}

	// Create the TaskRunners
	if err := ar.initTaskRunners(tg.Tasks); err != nil {
		return nil, err
	}

	return ar, nil
}

// initTaskRunners creates task runners but does *not* run them.
func (ar *allocRunner) initTaskRunners(tasks []*structs.Task) error {
	for _, task := range tasks {
		config := &taskrunner.Config{
			Alloc:                ar.alloc,
			ClientConfig:         ar.clientConfig,
			Task:                 task,
			TaskDir:              ar.allocDir.NewTaskDir(task.Name),
			Logger:               ar.logger,
			StateDB:              ar.stateDB,
			StateUpdater:         ar,
			DynamicRegistry:      ar.dynamicRegistry,
			Consul:               ar.consulClient,
			ConsulSI:             ar.sidsClient,
			Vault:                ar.vaultClient,
			DeviceStatsReporter:  ar.deviceStatsReporter,
			CSIManager:           ar.csiManager,
			DeviceManager:        ar.devicemanager,
			DriverManager:        ar.driverManager,
			ServersContactedCh:   ar.serversContactedCh,
			StartConditionMetCtx: ar.taskHookCoordinator.startConditionForTask(task),
		}

		// Create, but do not Run, the task runner
		tr, err := taskrunner.NewTaskRunner(config)
		if err != nil {
			return fmt.Errorf("failed creating runner for task %q: %v", task.Name, err)
		}

		ar.tasks[task.Name] = tr
	}
	return nil
}

func (ar *allocRunner) WaitCh() <-chan struct{} {
	return ar.waitCh
}
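// The construction and run sequence documented above (NewAllocRunner, then
// Restore before Run, then Run in its own goroutine, then wait on WaitCh) can
// be sketched as follows. runAllocSketch is a hypothetical illustration and is
// not called by the client; a real caller would also wire up Destroy/Shutdown.
func runAllocSketch(config *Config) error {
	ar, err := NewAllocRunner(config)
	if err != nil {
		return err
	}

	// Restore persisted state after construction but before Run.
	if err := ar.Restore(); err != nil {
		return err
	}

	// Run should be started in a goroutine; it closes WaitCh when it exits.
	go ar.Run()

	<-ar.WaitCh()
	return nil
}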
// Run the AllocRunner. Starts tasks if the alloc is non-terminal and closes
// WaitCh when it exits. Should be started in a goroutine.
func (ar *allocRunner) Run() {
	// Close the wait channel on return
	defer close(ar.waitCh)

	// Start the task state update handler
	go ar.handleTaskStateUpdates()

	// Start the alloc update handler
	go ar.handleAllocUpdates()

	// If task update chan has been closed, that means we've been shutdown.
	select {
	case <-ar.taskStateUpdateHandlerCh:
		return
	default:
	}

	// When handling a (potentially restored) terminal alloc, ensure tasks and
	// post-run hooks are run to perform any cleanup that may not have been
	// done before the earlier termination.

	// Run the prestart hooks if non-terminal
	if ar.shouldRun() {
		if err := ar.prerun(); err != nil {
			ar.logger.Error("prerun failed", "error", err)

			for _, tr := range ar.tasks {
				tr.MarkFailedDead(fmt.Sprintf("failed to setup alloc: %v", err))
			}

			goto POST
		}
	}

	// Run the runners (blocks until they exit)
	ar.runTasks()

POST:
	if ar.isShuttingDown() {
		return
	}

	// Run the postrun hooks
	if err := ar.postrun(); err != nil {
		ar.logger.Error("postrun failed", "error", err)
	}
}

// shouldRun returns true if the alloc is in a state that the alloc runner
// should run it.
func (ar *allocRunner) shouldRun() bool {
	// Do not run allocs that are terminal
	if ar.Alloc().TerminalStatus() {
		ar.logger.Trace("alloc terminal; not running",
			"desired_status", ar.Alloc().DesiredStatus,
			"client_status", ar.Alloc().ClientStatus,
		)
		return false
	}

	// It's possible that the alloc local state was marked terminal before
	// the server copy of the alloc (checked above) was marked as terminal,
	// so check the local state as well.
	switch clientStatus := ar.AllocState().ClientStatus; clientStatus {
	case structs.AllocClientStatusComplete, structs.AllocClientStatusFailed, structs.AllocClientStatusLost:
		ar.logger.Trace("alloc terminal; updating server and not running", "status", clientStatus)
		return false
	}

	return true
}

// runTasks is used to run the task runners and block until they exit.
func (ar *allocRunner) runTasks() {
	for _, task := range ar.tasks {
		go task.Run()
	}

	for _, task := range ar.tasks {
		<-task.WaitCh()
	}
}

// Alloc returns the current allocation being run by this runner as sent by the
// server. This view of the allocation does not have updated task states.
func (ar *allocRunner) Alloc() *structs.Allocation {
	ar.allocLock.RLock()
	defer ar.allocLock.RUnlock()
	return ar.alloc
}

func (ar *allocRunner) setAlloc(updated *structs.Allocation) {
	ar.allocLock.Lock()
	ar.alloc = updated
	ar.allocLock.Unlock()
}

// GetAllocDir returns the alloc dir which is safe for concurrent use.
func (ar *allocRunner) GetAllocDir() *allocdir.AllocDir {
	return ar.allocDir
}

// Restore state from database. Must be called after NewAllocRunner but before
// Run.
func (ar *allocRunner) Restore() error {
	// Retrieve deployment status to avoid resetting it across agent
	// restarts. Once a deployment status is set Nomad no longer monitors
	// alloc health, so we must persist deployment state across restarts.
	ds, err := ar.stateDB.GetDeploymentStatus(ar.id)
	if err != nil {
		return err
	}

	ar.stateLock.Lock()
	ar.state.DeploymentStatus = ds
	ar.stateLock.Unlock()

	states := make(map[string]*structs.TaskState)

	// Restore task runners
	for _, tr := range ar.tasks {
		if err := tr.Restore(); err != nil {
			return err
		}
		states[tr.Task().Name] = tr.TaskState()
	}

	ar.taskHookCoordinator.taskStateUpdated(states)

	return nil
}

// persistDeploymentStatus stores AllocDeploymentStatus.
func (ar *allocRunner) persistDeploymentStatus(ds *structs.AllocDeploymentStatus) {
	if err := ar.stateDB.PutDeploymentStatus(ar.id, ds); err != nil {
		// While any persistence errors are very bad, the worst case
		// scenario for failing to persist deployment status is that if
		// the agent is restarted it will monitor the deployment status
		// again. This could cause a deployment's status to change when
		// that shouldn't happen. However, allowing that seems better
		// than failing the entire allocation.
		ar.logger.Error("error storing deployment status", "error", err)
	}
}

// TaskStateUpdated is called by TaskRunner when a task's state has been
// updated. It does not process the update synchronously but instead notifies a
// goroutine that the state has changed. Since processing the state change may
// cause the task to be killed (thus changing its state again) it cannot be
// done synchronously as it would cause a deadlock due to reentrancy.
//
// The goroutine is used to compute changes to the alloc's ClientStatus and to
// update the server with the new state.
func (ar *allocRunner) TaskStateUpdated() {
	select {
	case ar.taskStateUpdatedCh <- struct{}{}:
	default:
		// already pending updates
	}
}

// handleTaskStateUpdates must be run in a goroutine as it monitors
// taskStateUpdatedCh for task state update notifications and processes task
// states.
//
// Processing task state updates must be done in a goroutine as it may have to
// kill tasks which causes further task state updates.
func (ar *allocRunner) handleTaskStateUpdates() {
	defer close(ar.taskStateUpdateHandlerCh)

	for done := false; !done; {
		select {
		case <-ar.taskStateUpdatedCh:
		case <-ar.waitCh:
			// Run has exited, sync once more to ensure final
			// states are collected.
			done = true
		}

		ar.logger.Trace("handling task state update", "done", done)

		// Set with the appropriate event if task runners should be
		// killed.
		var killEvent *structs.TaskEvent

		// If task runners should be killed, this is set to the task
		// name whose fault it is.
		killTask := ""

		// True if task runners should be killed because a leader
		// failed (informational).
		leaderFailed := false

		// Task state has been updated; gather the state of the other tasks
		trNum := len(ar.tasks)
		liveRunners := make([]*taskrunner.TaskRunner, 0, trNum)
		states := make(map[string]*structs.TaskState, trNum)

		for name, tr := range ar.tasks {
			state := tr.TaskState()
			states[name] = state

			// Capture live task runners in case we need to kill them
			if state.State != structs.TaskStateDead {
				liveRunners = append(liveRunners, tr)
				continue
			}

			// Task is dead, determine if other tasks should be killed
			if state.Failed {
				// Only set failed event if no event has been
				// set yet to give dead leaders priority.
				if killEvent == nil {
					killTask = name
					killEvent = structs.NewTaskEvent(structs.TaskSiblingFailed).
						SetFailedSibling(name)
				}
			} else if tr.IsLeader() {
				killEvent = structs.NewTaskEvent(structs.TaskLeaderDead)
				leaderFailed = true
				killTask = name
			}
		}

		// If there's a kill event set and live runners, kill them
		if killEvent != nil && len(liveRunners) > 0 {

			// Log kill reason
			if leaderFailed {
				ar.logger.Debug("leader task dead, destroying all tasks", "leader_task", killTask)
			} else {
				ar.logger.Debug("task failure, destroying all tasks", "failed_task", killTask)
			}

			// Emit kill event for live runners
			for _, tr := range liveRunners {
				tr.EmitEvent(killEvent)
			}

			// Kill 'em all
			states = ar.killTasks()

			// Wait for TaskRunners to exit before continuing to
			// prevent looping before TaskRunners have transitioned
			// to Dead.
			for _, tr := range liveRunners {
				select {
				case <-tr.WaitCh():
				case <-ar.waitCh:
				}
			}
		}

		ar.taskHookCoordinator.taskStateUpdated(states)

		// Get the client allocation
		calloc := ar.clientAlloc(states)

		// Update the server
		ar.stateUpdater.AllocStateUpdated(calloc)

		// Broadcast client alloc to listeners
		ar.allocBroadcaster.Send(calloc)
	}
}

// killTasks kills all task runners, leader (if there is one) first. Errors are
// logged except taskrunner.ErrTaskNotRunning which is ignored. Task states
// after Kill has been called are returned.
func (ar *allocRunner) killTasks() map[string]*structs.TaskState {
	var mu sync.Mutex
	states := make(map[string]*structs.TaskState, len(ar.tasks))

	// run alloc prekill hooks
	ar.preKillHooks()

	// Kill leader first, synchronously
	for name, tr := range ar.tasks {
		if !tr.IsLeader() {
			continue
		}

		taskEvent := structs.NewTaskEvent(structs.TaskKilling)
		taskEvent.SetKillTimeout(tr.Task().KillTimeout)
		err := tr.Kill(context.TODO(), taskEvent)
		if err != nil && err != taskrunner.ErrTaskNotRunning {
			ar.logger.Warn("error stopping leader task", "error", err, "task_name", name)
		}

		state := tr.TaskState()
		states[name] = state
		break
	}

	// Kill the rest concurrently
	wg := sync.WaitGroup{}
	for name, tr := range ar.tasks {
		if tr.IsLeader() {
			continue
		}

		wg.Add(1)
		go func(name string, tr *taskrunner.TaskRunner) {
			defer wg.Done()
			taskEvent := structs.NewTaskEvent(structs.TaskKilling)
			taskEvent.SetKillTimeout(tr.Task().KillTimeout)
			err := tr.Kill(context.TODO(), taskEvent)
			if err != nil && err != taskrunner.ErrTaskNotRunning {
				ar.logger.Warn("error stopping task", "error", err, "task_name", name)
			}

			state := tr.TaskState()
			mu.Lock()
			states[name] = state
			mu.Unlock()
		}(name, tr)
	}
	wg.Wait()

	return states
}

// clientAlloc takes in the task states and returns an Allocation populated
// with Client specific fields
func (ar *allocRunner) clientAlloc(taskStates map[string]*structs.TaskState) *structs.Allocation {
	ar.stateLock.Lock()
	defer ar.stateLock.Unlock()

	// store task states for AllocState to expose
	ar.state.TaskStates = taskStates

	a := &structs.Allocation{
		ID:         ar.id,
		TaskStates: taskStates,
	}

	if d := ar.state.DeploymentStatus; d != nil {
		a.DeploymentStatus = d.Copy()
	}

	// Compute the ClientStatus
	if ar.state.ClientStatus != "" {
		// The client status is being forced
		a.ClientStatus, a.ClientDescription = ar.state.ClientStatus, ar.state.ClientDescription
	} else {
		a.ClientStatus, a.ClientDescription = getClientStatus(taskStates)
	}

	// If the allocation is terminal, make sure all required fields are properly
	// set.
	if a.ClientTerminalStatus() {
		alloc := ar.Alloc()

		// If we are part of a deployment and the alloc has failed, mark the
		// alloc as unhealthy. This guards against the health watcher not being
		// started. If the health status is already set then terminal
		// allocations should not change it.
		if a.ClientStatus == structs.AllocClientStatusFailed &&
			alloc.DeploymentID != "" && !a.DeploymentStatus.HasHealth() {
			a.DeploymentStatus = &structs.AllocDeploymentStatus{
				Healthy: helper.BoolToPtr(false),
			}
		}

		// Make sure we have marked FinishedAt for every task. This is used
		// to calculate the reschedule time for failed allocations.
		now := time.Now()
		for taskName := range ar.tasks {
			ts, ok := a.TaskStates[taskName]
			if !ok {
				ts = &structs.TaskState{}
				a.TaskStates[taskName] = ts
			}
			if ts.FinishedAt.IsZero() {
				ts.FinishedAt = now
			}
		}
	}

	return a
}

// getClientStatus takes in the task states for a given allocation and computes
// the client status and description
func getClientStatus(taskStates map[string]*structs.TaskState) (status, description string) {
	var pending, running, dead, failed bool
	for _, state := range taskStates {
		switch state.State {
		case structs.TaskStateRunning:
			running = true
		case structs.TaskStatePending:
			pending = true
		case structs.TaskStateDead:
			if state.Failed {
				failed = true
			} else {
				dead = true
			}
		}
	}

	// Determine the alloc status
	if failed {
		return structs.AllocClientStatusFailed, "Failed tasks"
	} else if running {
		return structs.AllocClientStatusRunning, "Tasks are running"
	} else if pending {
		return structs.AllocClientStatusPending, "No tasks have started"
	} else if dead {
		return structs.AllocClientStatusComplete, "All tasks have completed"
	}

	return "", ""
}
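// The precedence applied by getClientStatus is failed, then running, then
// pending, then complete. clientStatusSketch is a hypothetical illustration
// (not used by the runner) with made-up task names, showing that a single
// failed task marks the whole allocation failed even while another task is
// still running.
func clientStatusSketch() {
	states := map[string]*structs.TaskState{
		"web": {State: structs.TaskStateRunning},
		"log": {State: structs.TaskStateDead, Failed: true},
	}

	status, desc := getClientStatus(states)
	fmt.Println(status, desc) // prints the failed status with "Failed tasks"
}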
// SetClientStatus is a helper for forcing a specific client
// status on the alloc runner. This is used during restore errors
// when the task state can't be restored.
func (ar *allocRunner) SetClientStatus(clientStatus string) {
	ar.stateLock.Lock()
	defer ar.stateLock.Unlock()
	ar.state.ClientStatus = clientStatus
}

// AllocState returns a copy of allocation state including a snapshot of task
// states.
func (ar *allocRunner) AllocState() *state.State {
	ar.stateLock.RLock()
	state := ar.state.Copy()
	ar.stateLock.RUnlock()

	// If TaskStateUpdated has not been called yet, ar.state.TaskStates
	// won't be set as it is not the canonical source of TaskStates.
	if len(state.TaskStates) == 0 {
		ar.state.TaskStates = make(map[string]*structs.TaskState, len(ar.tasks))
		for k, tr := range ar.tasks {
			state.TaskStates[k] = tr.TaskState()
		}
	}

	// Generate alloc to get other state fields
	alloc := ar.clientAlloc(state.TaskStates)
	state.ClientStatus = alloc.ClientStatus
	state.ClientDescription = alloc.ClientDescription
	state.DeploymentStatus = alloc.DeploymentStatus

	return state
}

// Update asynchronously updates the running allocation with a new version
// received from the server.
// When processing a new update, we will first attempt to drain stale updates
// from the queue before appending the new one.
func (ar *allocRunner) Update(update *structs.Allocation) {
	select {
	// Drain queued update from the channel if possible, and check the modify
	// index
	case oldUpdate := <-ar.allocUpdatedCh:
		// If the old update is newer than the replacement, then skip the new one
		// and return. This case shouldn't happen, but may in the case of a bug
		// elsewhere inside the system.
		if oldUpdate.AllocModifyIndex > update.AllocModifyIndex {
			ar.logger.Debug("Discarding allocation update due to newer alloc revision in queue",
				"old_modify_index", oldUpdate.AllocModifyIndex,
				"new_modify_index", update.AllocModifyIndex)
			ar.allocUpdatedCh <- oldUpdate
			return
		} else {
			ar.logger.Debug("Discarding allocation update",
				"skipped_modify_index", oldUpdate.AllocModifyIndex,
				"new_modify_index", update.AllocModifyIndex)
		}
	case <-ar.waitCh:
		ar.logger.Trace("AllocRunner has terminated, skipping alloc update",
			"modify_index", update.AllocModifyIndex)
		return
	default:
	}

	// Queue the new update
	ar.allocUpdatedCh <- update
}

func (ar *allocRunner) handleAllocUpdates() {
	for {
		select {
		case update := <-ar.allocUpdatedCh:
			ar.handleAllocUpdate(update)
		case <-ar.waitCh:
			return
		}
	}
}

// This method sends the updated alloc to Run for serial processing of updates.
// If there is already a pending update it will be discarded and replaced by
// the latest update.
func (ar *allocRunner) handleAllocUpdate(update *structs.Allocation) {
	// Detect Stop updates
	stopping := !ar.Alloc().TerminalStatus() && update.TerminalStatus()

	// Update ar.alloc
	ar.setAlloc(update)

	// Run update hooks if not stopping or dead
	if !update.TerminalStatus() {
		if err := ar.update(update); err != nil {
			ar.logger.Error("error running update hooks", "error", err)
		}
	}

	// Update task runners
	for _, tr := range ar.tasks {
		tr.Update(update)
	}

	// If alloc is being terminated, kill all tasks, leader first
	if stopping {
		ar.killTasks()
	}
}

func (ar *allocRunner) Listener() *cstructs.AllocListener {
	return ar.allocBroadcaster.Listen()
}

func (ar *allocRunner) destroyImpl() {
	// Stop any running tasks and persist states in case the client is
	// shutdown before Destroy finishes.
	states := ar.killTasks()
	calloc := ar.clientAlloc(states)
	ar.stateUpdater.AllocStateUpdated(calloc)

	// Wait for tasks to exit and postrun hooks to finish
	<-ar.waitCh

	// Run destroy hooks
	if err := ar.destroy(); err != nil {
		ar.logger.Warn("error running destroy hooks", "error", err)
	}

	// Wait for task state update handler to exit before removing local
	// state if Run() ran at all.
	<-ar.taskStateUpdateHandlerCh

	// Mark alloc as destroyed
	ar.destroyedLock.Lock()

	// Cleanup the state db while holding the lock to avoid a race with the
	// periodic PersistState, which may resurrect the alloc.
	if err := ar.stateDB.DeleteAllocationBucket(ar.id); err != nil {
		ar.logger.Warn("failed to delete allocation state", "error", err)
	}

	if !ar.shutdown {
		ar.shutdown = true
		close(ar.shutdownCh)
	}

	ar.destroyed = true
	close(ar.destroyCh)

	ar.destroyedLock.Unlock()
}
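// queueLatestSketch is a hypothetical, generic restatement of the
// drain-and-replace pattern used by Update above: the buffered channel holds
// at most one queued allocation, and the version with the higher
// AllocModifyIndex wins rather than updates piling up behind each other. It is
// not used by the runner.
func queueLatestSketch(queue chan *structs.Allocation, update *structs.Allocation) {
	// queue must be created with make(chan *structs.Allocation, 1).
	select {
	case old := <-queue:
		// Keep whichever version has the higher modify index.
		if old.AllocModifyIndex > update.AllocModifyIndex {
			update = old
		}
	default:
		// Nothing queued.
	}
	queue <- update
}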
func (ar *allocRunner) PersistState() error {
	ar.destroyedLock.Lock()
	defer ar.destroyedLock.Unlock()

	if ar.destroyed {
		err := ar.stateDB.DeleteAllocationBucket(ar.id)
		if err != nil {
			ar.logger.Warn("failed to delete allocation bucket", "error", err)
		}
		return nil
	}

	// TODO: consider persisting deployment state along with task status.
	// While we study why only the alloc is persisted, we maintain the current
	// behavior and do not risk adding yet more IO calls unnecessarily.
	return ar.stateDB.PutAllocation(ar.Alloc())
}

// Destroy the alloc runner by stopping it if it is still running and cleaning
// up all of its resources.
//
// This method is safe for calling concurrently with Run() and will cause it to
// exit (thus closing WaitCh).
// When the destroy action is completed, it will close DestroyCh().
func (ar *allocRunner) Destroy() {
	ar.destroyedLock.Lock()
	defer ar.destroyedLock.Unlock()

	if ar.destroyed {
		// Only destroy once
		return
	}

	if ar.destroyLaunched {
		// Only dispatch a destroy once
		return
	}

	ar.destroyLaunched = true

	// Synchronize calls to shutdown/destroy
	if ar.shutdownLaunched {
		go func() {
			ar.logger.Debug("Waiting for shutdown before destroying runner")
			<-ar.shutdownCh
			ar.destroyImpl()
		}()

		return
	}

	go ar.destroyImpl()
}

// IsDestroyed returns true if the alloc runner has been destroyed (stopped and
// garbage collected).
//
// This method is safe for calling concurrently with Run(). Callers must
// receive on WaitCh() to block until the alloc runner has stopped and been
// destroyed.
func (ar *allocRunner) IsDestroyed() bool {
	ar.destroyedLock.Lock()
	defer ar.destroyedLock.Unlock()
	return ar.destroyed
}

// IsWaiting returns true if the alloc runner is waiting for its previous
// allocation to terminate.
//
// This method is safe for calling concurrently with Run().
func (ar *allocRunner) IsWaiting() bool {
	return ar.prevAllocWatcher.IsWaiting()
}

// isShuttingDown returns true if the alloc runner is in a shutdown state
// due to a call to Shutdown() or Destroy()
func (ar *allocRunner) isShuttingDown() bool {
	ar.destroyedLock.Lock()
	defer ar.destroyedLock.Unlock()
	return ar.shutdownLaunched
}

// DestroyCh is a channel that is closed when an allocrunner is closed due to
// an explicit call to Destroy().
func (ar *allocRunner) DestroyCh() <-chan struct{} {
	return ar.destroyCh
}

// ShutdownCh is a channel that is closed when an allocrunner is closed due to
// either an explicit call to Shutdown(), or Destroy().
func (ar *allocRunner) ShutdownCh() <-chan struct{} {
	return ar.shutdownCh
}
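// stopSketch is a hypothetical illustration (not used by the client) of the
// two teardown paths exposed above and below: Shutdown detaches from tasks so
// they keep running and can be restored after an agent restart, while Destroy
// kills tasks and cleans up local state. Both run asynchronously, so callers
// wait on the corresponding channel.
func stopSketch(ar *allocRunner, agentShuttingDown bool) {
	if agentShuttingDown {
		// Graceful agent shutdown: leave tasks running for a later restore.
		ar.Shutdown()
		<-ar.ShutdownCh()
		return
	}

	// Allocation is being removed: stop tasks and release resources.
	ar.Destroy()
	<-ar.DestroyCh()
}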
// Shutdown AllocRunner gracefully. Asynchronously shuts down all TaskRunners.
// Tasks are unaffected and may be restored.
// When shutdown is completed, it will close ShutdownCh().
func (ar *allocRunner) Shutdown() {
	ar.destroyedLock.Lock()
	defer ar.destroyedLock.Unlock()

	// Destroy is a superset of Shutdown so there's nothing to do if this
	// has already been destroyed.
	if ar.destroyed {
		return
	}

	// Destroy is a superset of Shutdown so if it's been marked for destruction,
	// don't try to shut down in parallel. If shutdown has been launched, don't
	// try again.
	if ar.destroyLaunched || ar.shutdownLaunched {
		return
	}

	ar.shutdownLaunched = true

	go func() {
		ar.logger.Trace("shutting down")

		// Shutdown tasks gracefully if they were run
		wg := sync.WaitGroup{}
		for _, tr := range ar.tasks {
			wg.Add(1)
			go func(tr *taskrunner.TaskRunner) {
				tr.Shutdown()
				wg.Done()
			}(tr)
		}
		wg.Wait()

		// Wait for Run to exit
		<-ar.waitCh

		// Run shutdown hooks
		ar.shutdownHooks()

		// Wait for updater to finish its final run
		<-ar.taskStateUpdateHandlerCh

		ar.destroyedLock.Lock()
		ar.shutdown = true
		close(ar.shutdownCh)
		ar.destroyedLock.Unlock()
	}()
}

// IsMigrating returns true if the alloc runner is migrating data from its
// previous allocation.
//
// This method is safe for calling concurrently with Run().
func (ar *allocRunner) IsMigrating() bool {
	return ar.prevAllocMigrator.IsMigrating()
}

func (ar *allocRunner) StatsReporter() interfaces.AllocStatsReporter {
	return ar
}

// LatestAllocStats returns the latest stats for an allocation. If taskFilter
// is set, only stats for that task -- if it exists -- are returned.
func (ar *allocRunner) LatestAllocStats(taskFilter string) (*cstructs.AllocResourceUsage, error) {
	astat := &cstructs.AllocResourceUsage{
		Tasks: make(map[string]*cstructs.TaskResourceUsage, len(ar.tasks)),
		ResourceUsage: &cstructs.ResourceUsage{
			MemoryStats: &cstructs.MemoryStats{},
			CpuStats:    &cstructs.CpuStats{},
			DeviceStats: []*device.DeviceGroupStats{},
		},
	}

	for name, tr := range ar.tasks {
		if taskFilter != "" && taskFilter != name {
			// Getting stats for a particular task and it's not this one!
			continue
		}

		if usage := tr.LatestResourceUsage(); usage != nil {
			astat.Tasks[name] = usage
			astat.ResourceUsage.Add(usage.ResourceUsage)
			if usage.Timestamp > astat.Timestamp {
				astat.Timestamp = usage.Timestamp
			}
		}
	}

	return astat, nil
}

func (ar *allocRunner) GetTaskEventHandler(taskName string) drivermanager.EventHandler {
	if tr, ok := ar.tasks[taskName]; ok {
		return func(ev *drivers.TaskEvent) {
			tr.EmitEvent(&structs.TaskEvent{
				Type:          structs.TaskDriverMessage,
				Time:          ev.Timestamp.UnixNano(),
				Details:       ev.Annotations,
				DriverMessage: ev.Message,
			})
		}
	}
	return nil
}

// RestartTask signals the task runner for the provided task to restart.
func (ar *allocRunner) RestartTask(taskName string, taskEvent *structs.TaskEvent) error {
	tr, ok := ar.tasks[taskName]
	if !ok {
		return fmt.Errorf("Could not find task runner for task: %s", taskName)
	}

	return tr.Restart(context.TODO(), taskEvent, false)
}

// Restart satisfies the WorkloadRestarter interface and restarts all task
// runners concurrently.
func (ar *allocRunner) Restart(ctx context.Context, event *structs.TaskEvent, failure bool) error {
	waitCh := make(chan struct{})
	var err *multierror.Error
	var errMutex sync.Mutex

	go func() {
		var wg sync.WaitGroup
		defer close(waitCh)
		for tn, tr := range ar.tasks {
			wg.Add(1)
			go func(taskName string, r agentconsul.WorkloadRestarter) {
				defer wg.Done()
				e := r.Restart(ctx, event, failure)
				if e != nil {
					errMutex.Lock()
					defer errMutex.Unlock()
					err = multierror.Append(err, fmt.Errorf("failed to restart task %s: %v", taskName, e))
				}
			}(tn, tr)
		}
		wg.Wait()
	}()

	select {
	case <-waitCh:
	case <-ctx.Done():
	}

	return err.ErrorOrNil()
}

// RestartAll signals all task runners in the allocation to restart and passes
// a copy of the task event to each restart event.
// Returns any errors in a concatenated form.
func (ar *allocRunner) RestartAll(taskEvent *structs.TaskEvent) error {
	var err *multierror.Error

	for tn := range ar.tasks {
		rerr := ar.RestartTask(tn, taskEvent.Copy())
		if rerr != nil {
			err = multierror.Append(err, rerr)
		}
	}

	return err.ErrorOrNil()
}

// Signal sends a signal request to task runners inside an allocation. If the
// taskName is empty, then it is sent to all tasks.
func (ar *allocRunner) Signal(taskName, signal string) error {
	event := structs.NewTaskEvent(structs.TaskSignaling).SetSignalText(signal)

	if taskName != "" {
		tr, ok := ar.tasks[taskName]
		if !ok {
			return fmt.Errorf("Task not found")
		}

		return tr.Signal(event, signal)
	}

	var err *multierror.Error

	for tn, tr := range ar.tasks {
		rerr := tr.Signal(event.Copy(), signal)
		if rerr != nil {
			err = multierror.Append(err, fmt.Errorf("Failed to signal task: %s, err: %v", tn, rerr))
		}
	}

	return err.ErrorOrNil()
}

func (ar *allocRunner) GetTaskExecHandler(taskName string) drivermanager.TaskExecHandler {
	tr, ok := ar.tasks[taskName]
	if !ok {
		return nil
	}

	return tr.TaskExecHandler()
}

func (ar *allocRunner) GetTaskDriverCapabilities(taskName string) (*drivers.Capabilities, error) {
	tr, ok := ar.tasks[taskName]
	if !ok {
		return nil, fmt.Errorf("task not found")
	}

	return tr.DriverCapabilities()
}
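// taskControlSketch is a hypothetical illustration of the task-control surface
// defined above; it is not used by the client, and the task name "web" is made
// up. An empty task name passed to Signal fans the signal out to every task,
// while RestartTask targets a single task and attaches an explanatory event.
func taskControlSketch(ar *allocRunner) error {
	// Signal all tasks in the allocation.
	if err := ar.Signal("", "SIGHUP"); err != nil {
		return err
	}

	// Restart one task, recording why in its event stream.
	event := structs.NewTaskEvent(structs.TaskRestartSignal)
	return ar.RestartTask("web", event)
}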