// Source: github.com/Ilhicas/nomad@v1.0.4-0.20210304152020-e86851182bc3/client/allocrunner/alloc_runner.go

package allocrunner

import (
	"context"
	"fmt"
	"path/filepath"
	"sync"
	"time"

	log "github.com/hashicorp/go-hclog"
	multierror "github.com/hashicorp/go-multierror"
	"github.com/hashicorp/nomad/client/allocdir"
	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
	"github.com/hashicorp/nomad/client/allocrunner/state"
	"github.com/hashicorp/nomad/client/allocrunner/taskrunner"
	"github.com/hashicorp/nomad/client/allocwatcher"
	"github.com/hashicorp/nomad/client/config"
	"github.com/hashicorp/nomad/client/consul"
	"github.com/hashicorp/nomad/client/devicemanager"
	"github.com/hashicorp/nomad/client/dynamicplugins"
	cinterfaces "github.com/hashicorp/nomad/client/interfaces"
	"github.com/hashicorp/nomad/client/pluginmanager/csimanager"
	"github.com/hashicorp/nomad/client/pluginmanager/drivermanager"
	cstate "github.com/hashicorp/nomad/client/state"
	cstructs "github.com/hashicorp/nomad/client/structs"
	"github.com/hashicorp/nomad/client/vaultclient"
	agentconsul "github.com/hashicorp/nomad/command/agent/consul"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/plugins/device"
	"github.com/hashicorp/nomad/plugins/drivers"
)

// allocRunner is used to run all the tasks in a given allocation
type allocRunner struct {
	// id is the ID of the allocation. Can be accessed without a lock
	id string

	// Logger is the logger for the alloc runner.
	logger log.Logger

	clientConfig *config.Config

	// stateUpdater is used to emit updated alloc state
	stateUpdater cinterfaces.AllocStateHandler

	// taskStateUpdatedCh is ticked whenever task state has changed. Must
	// have len==1 to allow nonblocking notification of state updates while
	// the goroutine is already processing a previous update.
	taskStateUpdatedCh chan struct{}

	// taskStateUpdateHandlerCh is closed when the task state handling
	// goroutine exits. It is unsafe to destroy the local allocation state
	// before this goroutine exits.
	taskStateUpdateHandlerCh chan struct{}

	// allocUpdatedCh is a channel that is used to stream allocation updates into
	// the allocUpdate handler. Must have len==1 to allow nonblocking notification
	// of new allocation updates while the goroutine is processing a previous
	// update.
	allocUpdatedCh chan *structs.Allocation

	// consulClient is the client used by the consul service hook for
	// registering services and checks
	consulClient consul.ConsulServiceAPI

	// consulProxiesClient is the client used by the envoy version hook for
	// looking up supported envoy versions of the consul agent.
	consulProxiesClient consul.SupportedProxiesAPI

	// sidsClient is the client used by the service identity hook for
	// managing SI tokens
	sidsClient consul.ServiceIdentityAPI

	// vaultClient is used to manage Vault tokens
	vaultClient vaultclient.VaultClient

	// waitCh is closed when the Run loop has exited
	waitCh chan struct{}

	// destroyed is true when the Run loop has exited, postrun hooks have
	// run, and alloc runner has been destroyed. Must acquire destroyedLock
	// to access.
	destroyed bool

	// destroyCh is closed when the Run loop has exited, postrun hooks have
	// run, and alloc runner has been destroyed.
	destroyCh chan struct{}

	// shutdown is true when the Run loop has exited, and shutdown hooks have
	// run. Must acquire destroyedLock to access.
	shutdown bool

	// shutdownCh is closed when the Run loop has exited, and shutdown hooks
	// have run.
	shutdownCh chan struct{}

	// destroyLaunched is true if Destroy has been called. Must acquire
	// destroyedLock to access.
	destroyLaunched bool

	// shutdownLaunched is true if Shutdown has been called. Must acquire
	// destroyedLock to access.
	shutdownLaunched bool

	// destroyedLock guards destroyed, destroyLaunched, shutdownLaunched,
	// and serializes Shutdown/Destroy calls.
	destroyedLock sync.Mutex

	// Alloc captures the allocation being run.
	alloc     *structs.Allocation
	allocLock sync.RWMutex

	// state is the alloc runner's state
	state     *state.State
	stateLock sync.RWMutex

	stateDB cstate.StateDB

	// allocDir is used to build the allocation's directory structure.
	allocDir *allocdir.AllocDir

	// runnerHooks are alloc runner lifecycle hooks that should be run on state
	// transitions.
	runnerHooks []interfaces.RunnerHook

	// hookState is the output of allocrunner hooks
	hookState   *cstructs.AllocHookResources
	hookStateMu sync.RWMutex

	// tasks are the set of task runners
	tasks map[string]*taskrunner.TaskRunner

	// deviceStatsReporter is used to lookup resource usage for alloc devices
	deviceStatsReporter cinterfaces.DeviceStatsReporter

	// allocBroadcaster sends client allocation updates to all listeners
	allocBroadcaster *cstructs.AllocBroadcaster

	// prevAllocWatcher allows waiting for any previous or preempted allocations
	// to exit
	prevAllocWatcher allocwatcher.PrevAllocWatcher

	// prevAllocMigrator allows the migration of a previous allocation's alloc dir.
	prevAllocMigrator allocwatcher.PrevAllocMigrator

	// dynamicRegistry contains all locally registered dynamic plugins (e.g. csi
	// plugins).
	dynamicRegistry dynamicplugins.Registry

	// csiManager is used to wait for CSI Volumes to be attached, and by the task
	// runner to manage their mounting
	csiManager csimanager.Manager

	// devicemanager is used to mount devices as well as lookup device
	// statistics
	devicemanager devicemanager.Manager

	// driverManager is responsible for dispensing driver plugins and registering
	// event handlers
	driverManager drivermanager.Manager

	// serversContactedCh is passed to TaskRunners so they can detect when
	// servers have been contacted for the first time in case of a failed
	// restore.
	serversContactedCh chan struct{}

	taskHookCoordinator *taskHookCoordinator

	// rpcClient is the RPC Client that should be used by the allocrunner and its
	// hooks to communicate with Nomad Servers.
	rpcClient RPCer
}

// RPCer is the interface needed by hooks to make RPC calls.
type RPCer interface {
	RPC(method string, args interface{}, reply interface{}) error
}
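
// exampleCoalescedNotify is an illustrative sketch, with hypothetical names,
// of the len==1 buffered-channel pattern that taskStateUpdatedCh and
// allocUpdatedCh above rely on: producers never block, and redundant
// notifications are coalesced while the consumer is still processing a
// previous one. It is not part of the upstream API.
func exampleCoalescedNotify() (notify func(), updates <-chan struct{}) {
	ch := make(chan struct{}, 1)
	notify = func() {
		select {
		case ch <- struct{}{}:
			// Queued a wake-up for the consumer.
		default:
			// A notification is already pending; coalesce this one.
		}
	}
	return notify, ch
}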

// NewAllocRunner returns a new allocation runner.
func NewAllocRunner(config *Config) (*allocRunner, error) {
	alloc := config.Alloc
	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
	if tg == nil {
		return nil, fmt.Errorf("failed to lookup task group %q", alloc.TaskGroup)
	}

	ar := &allocRunner{
		id:                       alloc.ID,
		alloc:                    alloc,
		clientConfig:             config.ClientConfig,
		consulClient:             config.Consul,
		consulProxiesClient:      config.ConsulProxies,
		sidsClient:               config.ConsulSI,
		vaultClient:              config.Vault,
		tasks:                    make(map[string]*taskrunner.TaskRunner, len(tg.Tasks)),
		waitCh:                   make(chan struct{}),
		destroyCh:                make(chan struct{}),
		shutdownCh:               make(chan struct{}),
		state:                    &state.State{},
		stateDB:                  config.StateDB,
		stateUpdater:             config.StateUpdater,
		taskStateUpdatedCh:       make(chan struct{}, 1),
		taskStateUpdateHandlerCh: make(chan struct{}),
		allocUpdatedCh:           make(chan *structs.Allocation, 1),
		deviceStatsReporter:      config.DeviceStatsReporter,
		prevAllocWatcher:         config.PrevAllocWatcher,
		prevAllocMigrator:        config.PrevAllocMigrator,
		dynamicRegistry:          config.DynamicRegistry,
		csiManager:               config.CSIManager,
		devicemanager:            config.DeviceManager,
		driverManager:            config.DriverManager,
		serversContactedCh:       config.ServersContactedCh,
		rpcClient:                config.RPCClient,
	}

	// Create the logger based on the allocation ID
	ar.logger = config.Logger.Named("alloc_runner").With("alloc_id", alloc.ID)

	// Create alloc broadcaster
	ar.allocBroadcaster = cstructs.NewAllocBroadcaster(ar.logger)

	// Create alloc dir
	ar.allocDir = allocdir.NewAllocDir(ar.logger, filepath.Join(config.ClientConfig.AllocDir, alloc.ID))

	ar.taskHookCoordinator = newTaskHookCoordinator(ar.logger, tg.Tasks)

	// Initialize the runner's hooks.
	if err := ar.initRunnerHooks(config.ClientConfig); err != nil {
		return nil, err
	}

	// Create the TaskRunners
	if err := ar.initTaskRunners(tg.Tasks); err != nil {
		return nil, err
	}

	return ar, nil
}

// initTaskRunners creates task runners but does *not* run them.
func (ar *allocRunner) initTaskRunners(tasks []*structs.Task) error {
	for _, task := range tasks {
		trConfig := &taskrunner.Config{
			Alloc:                ar.alloc,
			ClientConfig:         ar.clientConfig,
			Task:                 task,
			TaskDir:              ar.allocDir.NewTaskDir(task.Name),
			Logger:               ar.logger,
			StateDB:              ar.stateDB,
			StateUpdater:         ar,
			DynamicRegistry:      ar.dynamicRegistry,
			Consul:               ar.consulClient,
			ConsulProxies:        ar.consulProxiesClient,
			ConsulSI:             ar.sidsClient,
			Vault:                ar.vaultClient,
			DeviceStatsReporter:  ar.deviceStatsReporter,
			CSIManager:           ar.csiManager,
			DeviceManager:        ar.devicemanager,
			DriverManager:        ar.driverManager,
			ServersContactedCh:   ar.serversContactedCh,
			StartConditionMetCtx: ar.taskHookCoordinator.startConditionForTask(task),
		}

		// Create, but do not Run, the task runner
		tr, err := taskrunner.NewTaskRunner(trConfig)
		if err != nil {
			return fmt.Errorf("failed creating runner for task %q: %v", task.Name, err)
		}

		ar.tasks[task.Name] = tr
	}
	return nil
}

func (ar *allocRunner) WaitCh() <-chan struct{} {
	return ar.waitCh
}
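
// exampleRunLifecycle is an illustrative, hypothetical sketch (not part of the
// upstream API) of the call order described by the comments in this file:
// construct the runner, Restore it when recovering from a client restart, then
// start Run in a goroutine and wait on WaitCh for it to finish.
func exampleRunLifecycle(cfg *Config, restoring bool) error {
	ar, err := NewAllocRunner(cfg)
	if err != nil {
		return err
	}

	if restoring {
		// Restore must be called after NewAllocRunner but before Run.
		if err := ar.Restore(); err != nil {
			return err
		}
	}

	go ar.Run()
	<-ar.WaitCh()
	return nil
}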

// Run the AllocRunner. Starts tasks if the alloc is non-terminal and closes
// WaitCh when it exits. Should be started in a goroutine.
func (ar *allocRunner) Run() {
	// Close the wait channel on return
	defer close(ar.waitCh)

	// Start the task state update handler
	go ar.handleTaskStateUpdates()

	// Start the alloc update handler
	go ar.handleAllocUpdates()

	// If the task update chan has been closed, that means we've been shut down.
	select {
	case <-ar.taskStateUpdateHandlerCh:
		return
	default:
	}

	// When handling a (potentially restored) terminal alloc, ensure tasks and post-run hooks are run
	// to perform any cleanup that's necessary, potentially not done prior to earlier termination

	// Run the prestart hooks if non-terminal
	if ar.shouldRun() {
		if err := ar.prerun(); err != nil {
			ar.logger.Error("prerun failed", "error", err)

			for _, tr := range ar.tasks {
				tr.MarkFailedDead(fmt.Sprintf("failed to setup alloc: %v", err))
			}

			goto POST
		}
	}

	// Run the runners (blocks until they exit)
	ar.runTasks()

POST:
	if ar.isShuttingDown() {
		return
	}

	// Run the postrun hooks
	if err := ar.postrun(); err != nil {
		ar.logger.Error("postrun failed", "error", err)
	}
}

// shouldRun returns true if the alloc is in a state that the alloc runner
// should run it.
func (ar *allocRunner) shouldRun() bool {
	// Do not run allocs that are terminal
	if ar.Alloc().TerminalStatus() {
		ar.logger.Trace("alloc terminal; not running",
			"desired_status", ar.Alloc().DesiredStatus,
			"client_status", ar.Alloc().ClientStatus,
		)
		return false
	}

	// It's possible that the alloc local state was marked terminal before
	// the server copy of the alloc (checked above) was marked as terminal,
	// so check the local state as well.
	switch clientStatus := ar.AllocState().ClientStatus; clientStatus {
	case structs.AllocClientStatusComplete, structs.AllocClientStatusFailed, structs.AllocClientStatusLost:
		ar.logger.Trace("alloc terminal; updating server and not running", "status", clientStatus)
		return false
	}

	return true
}

// runTasks is used to run the task runners and block until they exit.
func (ar *allocRunner) runTasks() {
	// Start all tasks
	for _, task := range ar.tasks {
		go task.Run()
	}

	// Block on all tasks except poststop tasks
	for _, task := range ar.tasks {
		if !task.IsPoststopTask() {
			<-task.WaitCh()
		}
	}

	// Signal poststop tasks to proceed now that the main tasks have exited
	ar.taskHookCoordinator.StartPoststopTasks()

	// Wait for poststop tasks to finish before proceeding
	for _, task := range ar.tasks {
		if task.IsPoststopTask() {
			<-task.WaitCh()
		}
	}
}

// Alloc returns the current allocation being run by this runner as sent by the
// server. This view of the allocation does not have updated task states.
func (ar *allocRunner) Alloc() *structs.Allocation {
	ar.allocLock.RLock()
	defer ar.allocLock.RUnlock()
	return ar.alloc
}

func (ar *allocRunner) setAlloc(updated *structs.Allocation) {
	ar.allocLock.Lock()
	ar.alloc = updated
	ar.allocLock.Unlock()
}

// GetAllocDir returns the alloc dir which is safe for concurrent use.
func (ar *allocRunner) GetAllocDir() *allocdir.AllocDir {
	return ar.allocDir
}

// Restore state from database. Must be called after NewAllocRunner but before
// Run.
func (ar *allocRunner) Restore() error {
	// Retrieve deployment status to avoid resetting it across agent
	// restarts. Once a deployment status is set Nomad no longer monitors
	// alloc health, so we must persist deployment state across restarts.
	ds, err := ar.stateDB.GetDeploymentStatus(ar.id)
	if err != nil {
		return err
	}

	ns, err := ar.stateDB.GetNetworkStatus(ar.id)
	if err != nil {
		return err
	}

	ar.stateLock.Lock()
	ar.state.DeploymentStatus = ds
	ar.state.NetworkStatus = ns
	ar.stateLock.Unlock()

	states := make(map[string]*structs.TaskState)

	// Restore task runners
	for _, tr := range ar.tasks {
		if err := tr.Restore(); err != nil {
			return err
		}
		states[tr.Task().Name] = tr.TaskState()
	}

	ar.taskHookCoordinator.taskStateUpdated(states)

	return nil
}

// persistDeploymentStatus stores AllocDeploymentStatus.
func (ar *allocRunner) persistDeploymentStatus(ds *structs.AllocDeploymentStatus) {
	if err := ar.stateDB.PutDeploymentStatus(ar.id, ds); err != nil {
		// While any persistence errors are very bad, the worst case
		// scenario for failing to persist deployment status is that if
		// the agent is restarted it will monitor the deployment status
		// again. This could cause a deployment's status to change when
		// that shouldn't happen. However, allowing that seems better
		// than failing the entire allocation.
		ar.logger.Error("error storing deployment status", "error", err)
	}
}

// TaskStateUpdated is called by TaskRunner when a task's state has been
// updated. It does not process the update synchronously but instead notifies a
// goroutine that the state has changed. Since processing the state change may
// cause the task to be killed (thus changing its state again) it cannot be done
// synchronously as it would cause a deadlock due to reentrancy.
//
// The goroutine is used to compute changes to the alloc's ClientStatus and to
// update the server with the new state.
func (ar *allocRunner) TaskStateUpdated() {
	select {
	case ar.taskStateUpdatedCh <- struct{}{}:
	default:
		// already pending updates
	}
}

// handleTaskStateUpdates must be run in a goroutine as it monitors
// taskStateUpdatedCh for task state update notifications and processes task
// states.
//
// Processing task state updates must be done in a goroutine as it may have to
// kill tasks which causes further task state updates.
func (ar *allocRunner) handleTaskStateUpdates() {
	defer close(ar.taskStateUpdateHandlerCh)

	hasSidecars := hasSidecarTasks(ar.tasks)

	for done := false; !done; {
		select {
		case <-ar.taskStateUpdatedCh:
		case <-ar.waitCh:
			// Run has exited, sync once more to ensure final
			// states are collected.
			done = true
		}

		ar.logger.Trace("handling task state update", "done", done)

		// Set with the appropriate event if task runners should be
		// killed.
		var killEvent *structs.TaskEvent

		// If task runners should be killed, this is set to the task
		// name whose fault it is.
		killTask := ""

		// Task state has been updated; gather the state of the other tasks
		trNum := len(ar.tasks)
		liveRunners := make([]*taskrunner.TaskRunner, 0, trNum)
		states := make(map[string]*structs.TaskState, trNum)

		for name, tr := range ar.tasks {
			state := tr.TaskState()
			states[name] = state

			if tr.IsPoststopTask() {
				continue
			}

			// Capture live task runners in case we need to kill them
			if state.State != structs.TaskStateDead {
				liveRunners = append(liveRunners, tr)
				continue
			}

			// Task is dead, determine if other tasks should be killed
			if state.Failed {
				// Only set failed event if no event has been
				// set yet to give dead leaders priority.
				if killEvent == nil {
					killTask = name
					killEvent = structs.NewTaskEvent(structs.TaskSiblingFailed).
						SetFailedSibling(name)
				}
			} else if tr.IsLeader() {
				killEvent = structs.NewTaskEvent(structs.TaskLeaderDead)
			}
		}

		// If all live runners are sidecars, kill the alloc
		if killEvent == nil && hasSidecars && !hasNonSidecarTasks(liveRunners) {
			killEvent = structs.NewTaskEvent(structs.TaskMainDead)
		}

		// If there's a kill event set and live runners, kill them
		if killEvent != nil && len(liveRunners) > 0 {

			// Log kill reason
			switch killEvent.Type {
			case structs.TaskLeaderDead:
				ar.logger.Debug("leader task dead, destroying all tasks", "leader_task", killTask)
			case structs.TaskMainDead:
				ar.logger.Debug("main tasks dead, destroying all sidecar tasks")
			default:
				ar.logger.Debug("task failure, destroying all tasks", "failed_task", killTask)
			}

			// Emit kill event for live runners
			for _, tr := range liveRunners {
				tr.EmitEvent(killEvent)
			}

			// Kill 'em all
			states = ar.killTasks()

			// Wait for TaskRunners to exit before continuing to
			// prevent looping before TaskRunners have transitioned
			// to Dead.
			for _, tr := range liveRunners {
				ar.logger.Info("killing task", "task", tr.Task().Name)
				select {
				case <-tr.WaitCh():
				case <-ar.waitCh:
				}
			}
		}

		ar.taskHookCoordinator.taskStateUpdated(states)

		// Get the client allocation
		calloc := ar.clientAlloc(states)

		// Update the server
		ar.stateUpdater.AllocStateUpdated(calloc)

		// Broadcast client alloc to listeners
		ar.allocBroadcaster.Send(calloc)
	}
}
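
// exampleDrainUntilClosed is a minimal, hypothetical sketch of the loop shape
// used by handleTaskStateUpdates above; the names are illustrative only.
// Coalesced notifications are handled until the stop channel closes, and one
// final pass runs after that so the last state change is never dropped.
func exampleDrainUntilClosed(updates, stop <-chan struct{}, handle func(final bool)) {
	for done := false; !done; {
		select {
		case <-updates:
		case <-stop:
			// The producer has exited; handle once more to collect
			// any final state.
			done = true
		}
		handle(done)
	}
}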

// killTasks kills all task runners, leader (if there is one) first. Errors are
// logged except taskrunner.ErrTaskNotRunning which is ignored. Task states
// after Kill has been called are returned.
func (ar *allocRunner) killTasks() map[string]*structs.TaskState {
	var mu sync.Mutex
	states := make(map[string]*structs.TaskState, len(ar.tasks))

	// run alloc prekill hooks
	ar.preKillHooks()

	// Kill leader first, synchronously
	for name, tr := range ar.tasks {
		if !tr.IsLeader() {
			continue
		}

		taskEvent := structs.NewTaskEvent(structs.TaskKilling)
		taskEvent.SetKillTimeout(tr.Task().KillTimeout)
		err := tr.Kill(context.TODO(), taskEvent)
		if err != nil && err != taskrunner.ErrTaskNotRunning {
			ar.logger.Warn("error stopping leader task", "error", err, "task_name", name)
		}

		state := tr.TaskState()
		states[name] = state
		break
	}

	// Kill the rest concurrently
	wg := sync.WaitGroup{}
	for name, tr := range ar.tasks {
		// Filter out poststop tasks so they run after all the other tasks are killed
		if tr.IsLeader() || tr.IsPoststopTask() {
			continue
		}

		wg.Add(1)
		go func(name string, tr *taskrunner.TaskRunner) {
			defer wg.Done()
			taskEvent := structs.NewTaskEvent(structs.TaskKilling)
			taskEvent.SetKillTimeout(tr.Task().KillTimeout)
			err := tr.Kill(context.TODO(), taskEvent)
			if err != nil && err != taskrunner.ErrTaskNotRunning {
				ar.logger.Warn("error stopping task", "error", err, "task_name", name)
			}

			state := tr.TaskState()
			mu.Lock()
			states[name] = state
			mu.Unlock()
		}(name, tr)
	}
	wg.Wait()

	return states
}

// clientAlloc takes in the task states and returns an Allocation populated
// with Client specific fields
func (ar *allocRunner) clientAlloc(taskStates map[string]*structs.TaskState) *structs.Allocation {
	ar.stateLock.Lock()
	defer ar.stateLock.Unlock()

	// store task states for AllocState to expose
	ar.state.TaskStates = taskStates

	a := &structs.Allocation{
		ID:         ar.id,
		TaskStates: taskStates,
	}

	if d := ar.state.DeploymentStatus; d != nil {
		a.DeploymentStatus = d.Copy()
	}

	// Compute the ClientStatus
	if ar.state.ClientStatus != "" {
		// The client status is being forced
		a.ClientStatus, a.ClientDescription = ar.state.ClientStatus, ar.state.ClientDescription
	} else {
		a.ClientStatus, a.ClientDescription = getClientStatus(taskStates)
	}

	// If the allocation is terminal, make sure all required fields are properly
	// set.
	if a.ClientTerminalStatus() {
		alloc := ar.Alloc()

		// If we are part of a deployment and the alloc has failed, mark the
		// alloc as unhealthy. This guards against the watcher not being started.
		// If the health status is already set then terminal allocations should
		// not overwrite it.
		if a.ClientStatus == structs.AllocClientStatusFailed &&
			alloc.DeploymentID != "" && !a.DeploymentStatus.HasHealth() {
			a.DeploymentStatus = &structs.AllocDeploymentStatus{
				Healthy: helper.BoolToPtr(false),
			}
		}

		// Make sure we have marked the finished at for every task. This is used
		// to calculate the reschedule time for failed allocations.
		now := time.Now()
		for taskName := range ar.tasks {
			ts, ok := a.TaskStates[taskName]
			if !ok {
				ts = &structs.TaskState{}
				a.TaskStates[taskName] = ts
			}
			if ts.FinishedAt.IsZero() {
				ts.FinishedAt = now
			}
		}
	}

	// Set the NetworkStatus and default DNSConfig if one is not returned from the client
	netStatus := ar.state.NetworkStatus
	if netStatus != nil {
		a.NetworkStatus = netStatus
	} else {
		a.NetworkStatus = new(structs.AllocNetworkStatus)
	}

	if a.NetworkStatus.DNS == nil {
		alloc := ar.Alloc()
		nws := alloc.Job.LookupTaskGroup(alloc.TaskGroup).Networks
		if len(nws) > 0 {
			a.NetworkStatus.DNS = nws[0].DNS.Copy()
		}
	}

	return a
}

// getClientStatus takes in the task states for a given allocation and computes
// the client status and description
func getClientStatus(taskStates map[string]*structs.TaskState) (status, description string) {
	var pending, running, dead, failed bool
	for _, state := range taskStates {
		switch state.State {
		case structs.TaskStateRunning:
			running = true
		case structs.TaskStatePending:
			pending = true
		case structs.TaskStateDead:
			if state.Failed {
				failed = true
			} else {
				dead = true
			}
		}
	}

	// Determine the alloc status
	if failed {
		return structs.AllocClientStatusFailed, "Failed tasks"
	} else if running {
		return structs.AllocClientStatusRunning, "Tasks are running"
	} else if pending {
		return structs.AllocClientStatusPending, "No tasks have started"
	} else if dead {
		return structs.AllocClientStatusComplete, "All tasks have completed"
	}

	return "", ""
}

// SetClientStatus is a helper for forcing a specific client
// status on the alloc runner. This is used during restore errors
// when the task state can't be restored.
func (ar *allocRunner) SetClientStatus(clientStatus string) {
	ar.stateLock.Lock()
	defer ar.stateLock.Unlock()
	ar.state.ClientStatus = clientStatus
}

func (ar *allocRunner) SetNetworkStatus(s *structs.AllocNetworkStatus) {
	ar.stateLock.Lock()
	defer ar.stateLock.Unlock()
	ar.state.NetworkStatus = s.Copy()
}

func (ar *allocRunner) NetworkStatus() *structs.AllocNetworkStatus {
	ar.stateLock.Lock()
	defer ar.stateLock.Unlock()
	return ar.state.NetworkStatus.Copy()
}

// AllocState returns a copy of allocation state including a snapshot of task
// states.
func (ar *allocRunner) AllocState() *state.State {
	ar.stateLock.RLock()
	state := ar.state.Copy()
	ar.stateLock.RUnlock()

	// If TaskStateUpdated has not been called yet, ar.state.TaskStates
	// won't be set as it is not the canonical source of TaskStates.
	if len(state.TaskStates) == 0 {
		state.TaskStates = make(map[string]*structs.TaskState, len(ar.tasks))
		for k, tr := range ar.tasks {
			state.TaskStates[k] = tr.TaskState()
		}
	}

	// Generate alloc to get other state fields
	alloc := ar.clientAlloc(state.TaskStates)
	state.ClientStatus = alloc.ClientStatus
	state.ClientDescription = alloc.ClientDescription
	state.DeploymentStatus = alloc.DeploymentStatus

	return state
}
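
// exampleStatusPrecedence is an illustrative worked example (not part of the
// upstream API) of the precedence implemented by getClientStatus above: a
// single failed task marks the whole allocation failed even while other tasks
// are still running.
func exampleStatusPrecedence() string {
	status, _ := getClientStatus(map[string]*structs.TaskState{
		"web": {State: structs.TaskStateRunning},
		"log": {State: structs.TaskStateDead, Failed: true},
	})
	return status // structs.AllocClientStatusFailed
}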

// Update asynchronously updates the running allocation with a new version
// received from the server.
// When processing a new update, we will first attempt to drain stale updates
// from the queue, before appending the new one.
func (ar *allocRunner) Update(update *structs.Allocation) {
	select {
	// Drain the queued update from the channel if possible, and check the modify
	// index
	case oldUpdate := <-ar.allocUpdatedCh:
		// If the old update is newer than the replacement, then skip the new one
		// and return. This case shouldn't happen, but may in the case of a bug
		// elsewhere inside the system.
		if oldUpdate.AllocModifyIndex > update.AllocModifyIndex {
			ar.logger.Debug("Discarding allocation update due to newer alloc revision in queue",
				"old_modify_index", oldUpdate.AllocModifyIndex,
				"new_modify_index", update.AllocModifyIndex)
			ar.allocUpdatedCh <- oldUpdate
			return
		} else {
			ar.logger.Debug("Discarding allocation update",
				"skipped_modify_index", oldUpdate.AllocModifyIndex,
				"new_modify_index", update.AllocModifyIndex)
		}
	case <-ar.waitCh:
		ar.logger.Trace("AllocRunner has terminated, skipping alloc update",
			"modify_index", update.AllocModifyIndex)
		return
	default:
	}

	// Queue the new update
	ar.allocUpdatedCh <- update
}

func (ar *allocRunner) handleAllocUpdates() {
	for {
		select {
		case update := <-ar.allocUpdatedCh:
			ar.handleAllocUpdate(update)
		case <-ar.waitCh:
			return
		}
	}
}

// This method sends the updated alloc to Run for serial processing of updates.
// If there is already a pending update it will be discarded and replaced by
// the latest update.
func (ar *allocRunner) handleAllocUpdate(update *structs.Allocation) {
	// Detect Stop updates
	stopping := !ar.Alloc().TerminalStatus() && update.TerminalStatus()

	// Update ar.alloc
	ar.setAlloc(update)

	// Run update hooks if not stopping or dead
	if !update.TerminalStatus() {
		if err := ar.update(update); err != nil {
			ar.logger.Error("error running update hooks", "error", err)
		}
	}

	// Update task runners
	for _, tr := range ar.tasks {
		tr.Update(update)
	}

	// If the alloc is being terminated, kill all tasks, leader first
	if stopping {
		ar.killTasks()
	}
}

func (ar *allocRunner) Listener() *cstructs.AllocListener {
	return ar.allocBroadcaster.Listen()
}

func (ar *allocRunner) destroyImpl() {
	// Stop any running tasks and persist states in case the client is
	// shut down before Destroy finishes.
	states := ar.killTasks()
	calloc := ar.clientAlloc(states)
	ar.stateUpdater.AllocStateUpdated(calloc)

	// Wait for tasks to exit and postrun hooks to finish
	<-ar.waitCh

	// Run destroy hooks
	if err := ar.destroy(); err != nil {
		ar.logger.Warn("error running destroy hooks", "error", err)
	}

	// Wait for the task state update handler to exit before removing local
	// state if Run() ran at all.
	<-ar.taskStateUpdateHandlerCh

	// Mark alloc as destroyed
	ar.destroyedLock.Lock()

	// Cleanup the state DB while holding the lock to avoid a race with the
	// periodic PersistState that may resurrect the alloc
	if err := ar.stateDB.DeleteAllocationBucket(ar.id); err != nil {
		ar.logger.Warn("failed to delete allocation state", "error", err)
	}

	if !ar.shutdown {
		ar.shutdown = true
		close(ar.shutdownCh)
	}

	ar.destroyed = true
	close(ar.destroyCh)

	ar.destroyedLock.Unlock()
}

func (ar *allocRunner) PersistState() error {
	ar.destroyedLock.Lock()
	defer ar.destroyedLock.Unlock()

	if ar.destroyed {
		err := ar.stateDB.DeleteAllocationBucket(ar.id, cstate.WithBatchMode())
		if err != nil {
			ar.logger.Warn("failed to delete allocation bucket", "error", err)
		}
		return nil
	}

	// Persist network status, wrapping in a func to release the state lock as early as possible
	err := func() error {
		ar.stateLock.Lock()
		defer ar.stateLock.Unlock()
		if ar.state.NetworkStatus != nil {
			err := ar.stateDB.PutNetworkStatus(ar.id, ar.state.NetworkStatus, cstate.WithBatchMode())
			if err != nil {
				return err
			}
		}
		return nil
	}()
	if err != nil {
		return err
	}

	// TODO: consider persisting deployment state along with task status.
	// While we study why only the alloc is persisted, I opted to maintain current
	// behavior and not risk adding yet more IO calls unnecessarily.
	return ar.stateDB.PutAllocation(ar.Alloc(), cstate.WithBatchMode())
}

// Destroy the alloc runner by stopping it if it is still running and cleaning
// up all of its resources.
//
// This method is safe for calling concurrently with Run() and will cause it to
// exit (thus closing WaitCh).
// When the destroy action is completed, it will close DestroyCh().
func (ar *allocRunner) Destroy() {
	ar.destroyedLock.Lock()
	defer ar.destroyedLock.Unlock()

	if ar.destroyed {
		// Only destroy once
		return
	}

	if ar.destroyLaunched {
		// Only dispatch a destroy once
		return
	}

	ar.destroyLaunched = true

	// Synchronize calls to shutdown/destroy
	if ar.shutdownLaunched {
		go func() {
			ar.logger.Debug("Waiting for shutdown before destroying runner")
			<-ar.shutdownCh
			ar.destroyImpl()
		}()

		return
	}

	go ar.destroyImpl()
}

// IsDestroyed returns true if the alloc runner has been destroyed (stopped and
// garbage collected).
//
// This method is safe for calling concurrently with Run(). Callers must
// receive on WaitCh() to block until alloc runner has stopped and been
// destroyed.
func (ar *allocRunner) IsDestroyed() bool {
	ar.destroyedLock.Lock()
	defer ar.destroyedLock.Unlock()
	return ar.destroyed
}
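
// exampleStopAllocRunner is an illustrative, hypothetical sketch (not part of
// the upstream API) of the two teardown paths in this file: Destroy (above)
// stops the runner and removes its local state, while Shutdown (below) stops
// the runner without killing tasks so they can be restored after an agent
// restart. Both are asynchronous, so callers wait on the matching channel.
func exampleStopAllocRunner(ar *allocRunner, gc bool) {
	if gc {
		// Garbage-collection path: stop everything and clean up state.
		ar.Destroy()
		<-ar.DestroyCh()
		return
	}

	// Agent-shutdown path: leave tasks running for a later Restore.
	ar.Shutdown()
	<-ar.ShutdownCh()
}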

// IsWaiting returns true if the alloc runner is waiting for its previous
// allocation to terminate.
//
// This method is safe for calling concurrently with Run().
func (ar *allocRunner) IsWaiting() bool {
	return ar.prevAllocWatcher.IsWaiting()
}

// isShuttingDown returns true if the alloc runner is in a shutdown state
// due to a call to Shutdown()
func (ar *allocRunner) isShuttingDown() bool {
	ar.destroyedLock.Lock()
	defer ar.destroyedLock.Unlock()
	return ar.shutdownLaunched
}

// DestroyCh is a channel that is closed when an allocrunner is closed due to
// an explicit call to Destroy().
func (ar *allocRunner) DestroyCh() <-chan struct{} {
	return ar.destroyCh
}

// ShutdownCh is a channel that is closed when an allocrunner is closed due to
// either an explicit call to Shutdown(), or Destroy().
func (ar *allocRunner) ShutdownCh() <-chan struct{} {
	return ar.shutdownCh
}

// Shutdown AllocRunner gracefully. Asynchronously shuts down all TaskRunners.
// Tasks are unaffected and may be restored.
// When the shutdown action is completed, it will close ShutdownCh().
func (ar *allocRunner) Shutdown() {
	ar.destroyedLock.Lock()
	defer ar.destroyedLock.Unlock()

	// Destroy is a superset of Shutdown so there's nothing to do if this
	// has already been destroyed.
	if ar.destroyed {
		return
	}

	// Destroy is a superset of Shutdown so if it's been marked for destruction,
	// don't try and shutdown in parallel. If shutdown has been launched, don't
	// try again.
	if ar.destroyLaunched || ar.shutdownLaunched {
		return
	}

	ar.shutdownLaunched = true

	go func() {
		ar.logger.Trace("shutting down")

		// Shutdown tasks gracefully if they were run
		wg := sync.WaitGroup{}
		for _, tr := range ar.tasks {
			wg.Add(1)
			go func(tr *taskrunner.TaskRunner) {
				tr.Shutdown()
				wg.Done()
			}(tr)
		}
		wg.Wait()

		// Wait for Run to exit
		<-ar.waitCh

		// Run shutdown hooks
		ar.shutdownHooks()

		// Wait for the updater to finish its final run
		<-ar.taskStateUpdateHandlerCh

		ar.destroyedLock.Lock()
		ar.shutdown = true
		close(ar.shutdownCh)
		ar.destroyedLock.Unlock()
	}()
}

// IsMigrating returns true if the alloc runner is migrating data from its
// previous allocation.
//
// This method is safe for calling concurrently with Run().
func (ar *allocRunner) IsMigrating() bool {
	return ar.prevAllocMigrator.IsMigrating()
}

func (ar *allocRunner) StatsReporter() interfaces.AllocStatsReporter {
	return ar
}

// LatestAllocStats returns the latest stats for an allocation. If taskFilter
// is set, only stats for that task -- if it exists -- are returned.
func (ar *allocRunner) LatestAllocStats(taskFilter string) (*cstructs.AllocResourceUsage, error) {
	astat := &cstructs.AllocResourceUsage{
		Tasks: make(map[string]*cstructs.TaskResourceUsage, len(ar.tasks)),
		ResourceUsage: &cstructs.ResourceUsage{
			MemoryStats: &cstructs.MemoryStats{},
			CpuStats:    &cstructs.CpuStats{},
			DeviceStats: []*device.DeviceGroupStats{},
		},
	}

	for name, tr := range ar.tasks {
		if taskFilter != "" && taskFilter != name {
			// Getting stats for a particular task and it's not this one!
			continue
		}

		if usage := tr.LatestResourceUsage(); usage != nil {
			astat.Tasks[name] = usage
			astat.ResourceUsage.Add(usage.ResourceUsage)
			if usage.Timestamp > astat.Timestamp {
				astat.Timestamp = usage.Timestamp
			}
		}
	}

	return astat, nil
}

func (ar *allocRunner) GetTaskEventHandler(taskName string) drivermanager.EventHandler {
	if tr, ok := ar.tasks[taskName]; ok {
		return func(ev *drivers.TaskEvent) {
			tr.EmitEvent(&structs.TaskEvent{
				Type:          structs.TaskDriverMessage,
				Time:          ev.Timestamp.UnixNano(),
				Details:       ev.Annotations,
				DriverMessage: ev.Message,
			})
		}
	}
	return nil
}

// RestartTask signals the task runner for the provided task to restart.
func (ar *allocRunner) RestartTask(taskName string, taskEvent *structs.TaskEvent) error {
	tr, ok := ar.tasks[taskName]
	if !ok {
		return fmt.Errorf("Could not find task runner for task: %s", taskName)
	}

	return tr.Restart(context.TODO(), taskEvent, false)
}

// Restart satisfies the WorkloadRestarter interface and restarts all task
// runners concurrently.
func (ar *allocRunner) Restart(ctx context.Context, event *structs.TaskEvent, failure bool) error {
	waitCh := make(chan struct{})
	var err *multierror.Error
	var errMutex sync.Mutex

	// run alloc task restart hooks
	ar.taskRestartHooks()

	go func() {
		var wg sync.WaitGroup
		defer close(waitCh)
		for tn, tr := range ar.tasks {
			wg.Add(1)
			go func(taskName string, r agentconsul.WorkloadRestarter) {
				defer wg.Done()
				e := r.Restart(ctx, event, failure)
				if e != nil {
					errMutex.Lock()
					defer errMutex.Unlock()
					err = multierror.Append(err, fmt.Errorf("failed to restart task %s: %v", taskName, e))
				}
			}(tn, tr)
		}
		wg.Wait()
	}()

	select {
	case <-waitCh:
	case <-ctx.Done():
	}

	return err.ErrorOrNil()
}

// RestartAll signals all task runners in the allocation to restart and passes
// a copy of the task event to each restart event.
// Returns any errors in a concatenated form.
func (ar *allocRunner) RestartAll(taskEvent *structs.TaskEvent) error {
	var err *multierror.Error

	// run alloc task restart hooks
	ar.taskRestartHooks()

	for tn := range ar.tasks {
		rerr := ar.RestartTask(tn, taskEvent.Copy())
		if rerr != nil {
			err = multierror.Append(err, rerr)
		}
	}

	return err.ErrorOrNil()
}
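
// exampleRestartAllTasks is an illustrative, hypothetical sketch (not part of
// the upstream API) of driving the restart entry points above: build a
// TaskEvent describing why the restart happened and let the alloc runner fan
// it out to every task runner.
func exampleRestartAllTasks(ar *allocRunner, reason string) error {
	ev := structs.NewTaskEvent(structs.TaskRestartSignal).SetDisplayMessage(reason)
	return ar.RestartAll(ev)
}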

// Signal sends a signal request to task runners inside an allocation. If the
// taskName is empty, then it is sent to all tasks.
func (ar *allocRunner) Signal(taskName, signal string) error {
	event := structs.NewTaskEvent(structs.TaskSignaling).SetSignalText(signal)

	if taskName != "" {
		tr, ok := ar.tasks[taskName]
		if !ok {
			return fmt.Errorf("Task not found")
		}

		return tr.Signal(event, signal)
	}

	var err *multierror.Error

	for tn, tr := range ar.tasks {
		rerr := tr.Signal(event.Copy(), signal)
		if rerr != nil {
			err = multierror.Append(err, fmt.Errorf("Failed to signal task: %s, err: %v", tn, rerr))
		}
	}

	return err.ErrorOrNil()
}

func (ar *allocRunner) GetTaskExecHandler(taskName string) drivermanager.TaskExecHandler {
	tr, ok := ar.tasks[taskName]
	if !ok {
		return nil
	}

	return tr.TaskExecHandler()
}

func (ar *allocRunner) GetTaskDriverCapabilities(taskName string) (*drivers.Capabilities, error) {
	tr, ok := ar.tasks[taskName]
	if !ok {
		return nil, fmt.Errorf("task not found")
	}

	return tr.DriverCapabilities()
}
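
// exampleExecHandlerIfSupported is an illustrative, hypothetical sketch (not
// part of the upstream API) combining the two lookups above: check the task
// driver's capabilities before asking for its exec handler, and return an
// error when remote exec is not supported.
func exampleExecHandlerIfSupported(ar *allocRunner, taskName string) (drivermanager.TaskExecHandler, error) {
	caps, err := ar.GetTaskDriverCapabilities(taskName)
	if err != nil {
		return nil, err
	}

	if !caps.Exec {
		return nil, fmt.Errorf("task %q driver does not support exec", taskName)
	}

	return ar.GetTaskExecHandler(taskName), nil
}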