github.com/Ilhicas/nomad@v1.0.4-0.20210304152020-e86851182bc3/client/allocrunner/taskrunner/task_runner.go

package taskrunner

import (
	"context"
	"errors"
	"fmt"
	"strings"
	"sync"
	"time"

	metrics "github.com/armon/go-metrics"
	log "github.com/hashicorp/go-hclog"
	multierror "github.com/hashicorp/go-multierror"
	"github.com/hashicorp/hcl/v2/hcldec"
	"github.com/hashicorp/nomad/client/allocdir"
	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
	"github.com/hashicorp/nomad/client/allocrunner/taskrunner/restarts"
	"github.com/hashicorp/nomad/client/allocrunner/taskrunner/state"
	"github.com/hashicorp/nomad/client/config"
	"github.com/hashicorp/nomad/client/consul"
	"github.com/hashicorp/nomad/client/devicemanager"
	"github.com/hashicorp/nomad/client/dynamicplugins"
	cinterfaces "github.com/hashicorp/nomad/client/interfaces"
	"github.com/hashicorp/nomad/client/pluginmanager/csimanager"
	"github.com/hashicorp/nomad/client/pluginmanager/drivermanager"
	cstate "github.com/hashicorp/nomad/client/state"
	cstructs "github.com/hashicorp/nomad/client/structs"
	"github.com/hashicorp/nomad/client/taskenv"
	"github.com/hashicorp/nomad/client/vaultclient"
	"github.com/hashicorp/nomad/helper/pluginutils/hclspecutils"
	"github.com/hashicorp/nomad/helper/pluginutils/hclutils"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/structs"
	bstructs "github.com/hashicorp/nomad/plugins/base/structs"
	"github.com/hashicorp/nomad/plugins/drivers"
)

const (
	// defaultMaxEvents is the default max capacity for task events on the
	// task state. Overrideable for testing.
	defaultMaxEvents = 10

	// killBackoffBaseline is the baseline time for exponential backoff while
	// killing a task.
	killBackoffBaseline = 5 * time.Second

	// killBackoffLimit is the limit of the exponential backoff for killing
	// the task.
	killBackoffLimit = 2 * time.Minute

	// killFailureLimit is how many times we will attempt to kill a task before
	// giving up and potentially leaking resources.
	killFailureLimit = 5

	// triggerUpdateChCap is the capacity for the triggerUpdateCh used for
	// triggering updates. It should be exactly 1 as even if multiple
	// updates have come in since the last one was handled, we only need to
	// handle the last one.
	triggerUpdateChCap = 1
)

type TaskRunner struct {
	// allocID, taskName, taskLeader, and taskResources are immutable so these
	// fields may be accessed without locks
	allocID       string
	taskName      string
	taskLeader    bool
	taskResources *structs.AllocatedTaskResources

	alloc     *structs.Allocation
	allocLock sync.Mutex

	clientConfig *config.Config

	// stateUpdater is used to emit updated task state
	stateUpdater interfaces.TaskStateHandler

	// state captures the state of the task for updating the allocation
	// Must acquire stateLock to access.
	state *structs.TaskState

	// localState captures the node-local state of the task for when the
	// Nomad agent restarts.
	// Must acquire stateLock to access.
	localState *state.LocalState

	// stateLock must be acquired when accessing state or localState.
	stateLock sync.RWMutex

	// stateDB is for persisting localState and taskState
	stateDB cstate.StateDB

	// shutdownCtx is used to exit the TaskRunner *without* affecting task state.
	shutdownCtx context.Context

	// shutdownCtxCancel causes the TaskRunner to exit immediately without
	// affecting task state. Useful for testing or graceful agent shutdown.
	shutdownCtxCancel context.CancelFunc

	// killCtx is the task runner's context representing the task's lifecycle.
	// The context is canceled when the task is killed.
	killCtx context.Context

	// killCtxCancel is called when killing a task.
	killCtxCancel context.CancelFunc

	// killErr is populated when killing a task. Access should be done using
	// the getter/setter.
	killErr     error
	killErrLock sync.Mutex

	// logger is the logger for the task runner.
	logger log.Logger

	// triggerUpdateCh is ticked whenever update hooks need to be run and
	// must be created with cap=1 to signal a pending update and prevent
	// callers from deadlocking if the receiver has exited.
	triggerUpdateCh chan struct{}

	// waitCh is closed when the task runner has transitioned to a terminal
	// state
	waitCh chan struct{}

	// driver is the driver for the task.
	driver drivers.DriverPlugin

	// driverCapabilities is the set of capabilities the driver supports
	driverCapabilities *drivers.Capabilities

	// taskSchema is the hcl spec for the task driver configuration
	taskSchema hcldec.Spec

	// handleLock guards access to handle and handleResult
	handleLock sync.Mutex

	// handle to the running driver
	handle *DriverHandle

	// task is the task being run
	task     *structs.Task
	taskLock sync.RWMutex

	// taskDir is the directory structure for this task.
	taskDir *allocdir.TaskDir

	// envBuilder is used to build the task's environment
	envBuilder *taskenv.Builder

	// restartTracker is used to decide if the task should be restarted.
	restartTracker *restarts.RestartTracker

	// runnerHooks are task runner lifecycle hooks that should be run on state
	// transitions.
	runnerHooks []interfaces.TaskHook

	// hookResources captures the resources provided by hooks
	hookResources *hookResources

	// consulServiceClient is the client used by the consul service hook for
	// registering services and checks
	consulServiceClient consul.ConsulServiceAPI

	// consulProxiesClient is the client used by the envoy version hook for
	// asking consul what version of envoy nomad should inject into the connect
	// sidecar or gateway task.
	consulProxiesClient consul.SupportedProxiesAPI

	// siClient is the client used by the service identity hook for managing
	// service identity tokens
	siClient consul.ServiceIdentityAPI

	// vaultClient is the client to use to derive and renew Vault tokens
	vaultClient vaultclient.VaultClient

	// vaultToken is the current Vault token. It should be accessed with the
	// getter.
	vaultToken     string
	vaultTokenLock sync.Mutex

	// baseLabels are used when emitting tagged metrics. All task runner metrics
	// will have these tags, and optionally more.
	baseLabels []metrics.Label

	// logmonHookConfig is used to get the paths to the stdout and stderr fifos
	// to be passed to the driver for task logging
	logmonHookConfig *logmonHookConfig

	// resourceUsage is written via UpdateStats and read via
	// LatestResourceUsage. May be nil at all times.
	resourceUsage     *cstructs.TaskResourceUsage
	resourceUsageLock sync.Mutex

	// deviceStatsReporter is used to lookup resource usage for alloc devices
	deviceStatsReporter cinterfaces.DeviceStatsReporter

	// csiManager is used to manage the mounting of CSI volumes into tasks
	csiManager csimanager.Manager

	// devicemanager is used to mount devices as well as lookup device
	// statistics
	devicemanager devicemanager.Manager

	// driverManager is used to dispense driver plugins and register event
	// handlers
	driverManager drivermanager.Manager

	// dynamicRegistry is where dynamic plugins should be registered.
	dynamicRegistry dynamicplugins.Registry

	// maxEvents is the capacity of the TaskEvents on the TaskState.
	// Defaults to defaultMaxEvents but overrideable for testing.
	maxEvents int

	// serversContactedCh is passed to TaskRunners so they can detect when
	// GetClientAllocs has been called in case of a failed restore.
	serversContactedCh <-chan struct{}

	// startConditionMetCtx is closed when the TaskRunner should start the task
	startConditionMetCtx <-chan struct{}

	// waitOnServers defaults to false but will be set true if a restore
	// fails and the Run method should wait until serversContactedCh is
	// closed.
	waitOnServers bool

	networkIsolationLock sync.Mutex
	networkIsolationSpec *drivers.NetworkIsolationSpec

	allocHookResources *cstructs.AllocHookResources
}

// Config is the configuration for creating a TaskRunner.
type Config struct {
	Alloc        *structs.Allocation
	ClientConfig *config.Config
	Task         *structs.Task
	TaskDir      *allocdir.TaskDir
	Logger       log.Logger

	// Consul is the client to use for managing Consul service registrations
	Consul consul.ConsulServiceAPI

	// ConsulProxies is the client to use for looking up supported envoy versions
	// from Consul.
	ConsulProxies consul.SupportedProxiesAPI

	// ConsulSI is the client to use for managing Consul SI tokens
	ConsulSI consul.ServiceIdentityAPI

	// DynamicRegistry is where dynamic plugins should be registered.
	DynamicRegistry dynamicplugins.Registry

	// Vault is the client to use to derive and renew Vault tokens
	Vault vaultclient.VaultClient

	// StateDB is used to store and restore state.
	StateDB cstate.StateDB

	// StateUpdater is used to emit updated task state
	StateUpdater interfaces.TaskStateHandler

	// DeviceStatsReporter is used to lookup resource usage for alloc devices
	DeviceStatsReporter cinterfaces.DeviceStatsReporter

	// CSIManager is used to manage the mounting of CSI volumes into tasks
	CSIManager csimanager.Manager

	// DeviceManager is used to mount devices as well as lookup device
	// statistics
	DeviceManager devicemanager.Manager

	// DriverManager is used to dispense driver plugins and register event
	// handlers
	DriverManager drivermanager.Manager

	// ServersContactedCh is closed when the first GetClientAllocs call to
	// servers succeeds and allocs are synced.
	ServersContactedCh chan struct{}

	// StartConditionMetCtx is closed when the TaskRunner should start the task
	StartConditionMetCtx <-chan struct{}
}

// NewTaskRunner creates a TaskRunner from the given Config.
func NewTaskRunner(config *Config) (*TaskRunner, error) {
	// Create a context for causing the runner to exit
	trCtx, trCancel := context.WithCancel(context.Background())

	// Create a context for killing the runner
	killCtx, killCancel := context.WithCancel(context.Background())

	// Initialize the environment builder
	envBuilder := taskenv.NewBuilder(
		config.ClientConfig.Node,
		config.Alloc,
		config.Task,
		config.ClientConfig.Region,
	)

	// Initialize state from alloc if it is set
	tstate := structs.NewTaskState()
	if ts := config.Alloc.TaskStates[config.Task.Name]; ts != nil {
		tstate = ts.Copy()
	}

	tr := &TaskRunner{
		alloc:                config.Alloc,
		allocID:              config.Alloc.ID,
		clientConfig:         config.ClientConfig,
		task:                 config.Task,
		taskDir:              config.TaskDir,
		taskName:             config.Task.Name,
		taskLeader:           config.Task.Leader,
		envBuilder:           envBuilder,
		dynamicRegistry:      config.DynamicRegistry,
		consulServiceClient:  config.Consul,
		consulProxiesClient:  config.ConsulProxies,
		siClient:             config.ConsulSI,
		vaultClient:          config.Vault,
		state:                tstate,
		localState:           state.NewLocalState(),
		stateDB:              config.StateDB,
		stateUpdater:         config.StateUpdater,
		deviceStatsReporter:  config.DeviceStatsReporter,
		killCtx:              killCtx,
		killCtxCancel:        killCancel,
		shutdownCtx:          trCtx,
		shutdownCtxCancel:    trCancel,
		triggerUpdateCh:      make(chan struct{}, triggerUpdateChCap),
		waitCh:               make(chan struct{}),
		csiManager:           config.CSIManager,
		devicemanager:        config.DeviceManager,
		driverManager:        config.DriverManager,
		maxEvents:            defaultMaxEvents,
		serversContactedCh:   config.ServersContactedCh,
		startConditionMetCtx: config.StartConditionMetCtx,
	}

	// Create the logger based on the allocation ID
	tr.logger = config.Logger.Named("task_runner").With("task", config.Task.Name)

	// Pull out the task's resources
	ares := tr.alloc.AllocatedResources
	if ares == nil {
		return nil, fmt.Errorf("no task resources found on allocation")
	}

	tres, ok := ares.Tasks[tr.taskName]
	if !ok {
		return nil, fmt.Errorf("no task resources found on allocation")
	}
	tr.taskResources = tres

	// Build the restart tracker.
	rp := config.Task.RestartPolicy
	if rp == nil {
		tg := tr.alloc.Job.LookupTaskGroup(tr.alloc.TaskGroup)
		if tg == nil {
			tr.logger.Error("alloc missing task group")
			return nil, fmt.Errorf("alloc missing task group")
		}
		rp = tg.RestartPolicy
	}
	tr.restartTracker = restarts.NewRestartTracker(rp, tr.alloc.Job.Type, config.Task.Lifecycle)

	// Get the driver
	if err := tr.initDriver(); err != nil {
		tr.logger.Error("failed to create driver", "error", err)
		return nil, err
	}

	// Initialize the runner's hooks.
	tr.initHooks()

	// Initialize base labels
	tr.initLabels()

	// Initialize initial task received event
	tr.appendEvent(structs.NewTaskEvent(structs.TaskReceived))

	return tr, nil
}

// initLabels builds the base metric labels attached to all task runner metrics.
func (tr *TaskRunner) initLabels() {
	alloc := tr.Alloc()
	tr.baseLabels = []metrics.Label{
		{
			Name:  "job",
			Value: alloc.Job.Name,
		},
		{
			Name:  "task_group",
			Value: alloc.TaskGroup,
		},
		{
			Name:  "alloc_id",
			Value: tr.allocID,
		},
		{
			Name:  "task",
			Value: tr.taskName,
		},
		{
			Name:  "namespace",
			Value: tr.alloc.Namespace,
		},
	}

	if tr.alloc.Job.ParentID != "" {
		tr.baseLabels = append(tr.baseLabels, metrics.Label{
			Name:  "parent_id",
			Value: tr.alloc.Job.ParentID,
		})
		if strings.Contains(tr.alloc.Job.Name, "/dispatch-") {
			tr.baseLabels = append(tr.baseLabels, metrics.Label{
				Name:  "dispatch_id",
				Value: strings.Split(tr.alloc.Job.Name, "/dispatch-")[1],
			})
		}
		if strings.Contains(tr.alloc.Job.Name, "/periodic-") {
			tr.baseLabels = append(tr.baseLabels, metrics.Label{
				Name:  "periodic_id",
				Value: strings.Split(tr.alloc.Job.Name, "/periodic-")[1],
			})
		}
	}
}

// MarkFailedDead marks a task as failed and not to run. Intended to be invoked
// when alloc runner prestart hooks fail.
// Should never be called together with Run().
func (tr *TaskRunner) MarkFailedDead(reason string) {
	defer close(tr.waitCh)

	tr.stateLock.Lock()
	if err := tr.stateDB.PutTaskRunnerLocalState(tr.allocID, tr.taskName, tr.localState); err != nil {
		//TODO Nomad will be unable to restore this task; try to kill
		// it now and fail? In general we prefer to leave running
		// tasks running even if the agent encounters an error.
		tr.logger.Warn("error persisting local failed task state; may be unable to restore after a Nomad restart",
			"error", err)
	}
	tr.stateLock.Unlock()

	event := structs.NewTaskEvent(structs.TaskSetupFailure).
		SetDisplayMessage(reason).
		SetFailsTask()
	tr.UpdateState(structs.TaskStateDead, event)

	// Run the stop hooks in case task was a restored task that failed prestart
	if err := tr.stop(); err != nil {
		tr.logger.Error("stop failed while marking task dead", "error", err)
	}
}

// Run the TaskRunner. Starts the user's task or reattaches to a restored task.
// Run closes WaitCh when it exits. Should be started in a goroutine.
func (tr *TaskRunner) Run() {
	defer close(tr.waitCh)
	var result *drivers.ExitResult

	tr.stateLock.RLock()
	dead := tr.state.State == structs.TaskStateDead
	tr.stateLock.RUnlock()

	// If restoring a dead task, ensure that the task is cleared and all post
	// hooks are called without additional state updates.
	if dead {
		// Run cleanup functions without emitting any additional events/work
		// to handle cases where we restored a dead task whose client
		// terminated after the task finished but before completing post-run
		// actions.
		tr.clearDriverHandle()
		tr.stateUpdater.TaskStateUpdated()
		if err := tr.stop(); err != nil {
			tr.logger.Error("stop failed on terminal task", "error", err)
		}
		return
	}

	// Updates are handled asynchronously with the other hooks but each
	// triggered update - whether due to alloc updates or a new vault token
	// - should be handled serially.
	go tr.handleUpdates()

	// If restore failed, wait until servers are contacted before running.
	// #1795
	if tr.waitOnServers {
		tr.logger.Info("task failed to restore; waiting to contact server before restarting")
		select {
		case <-tr.killCtx.Done():
		case <-tr.shutdownCtx.Done():
			return
		case <-tr.serversContactedCh:
			tr.logger.Info("server contacted; unblocking waiting task")
		}
	}

	select {
	case <-tr.startConditionMetCtx:
		tr.logger.Debug("lifecycle start condition has been met, proceeding")
		// yay proceed
	case <-tr.killCtx.Done():
	case <-tr.shutdownCtx.Done():
		return
	}

MAIN:
	for !tr.shouldShutdown() {
		select {
		case <-tr.killCtx.Done():
			break MAIN
		case <-tr.shutdownCtx.Done():
			// TaskRunner was told to exit immediately
			return
		default:
		}

		// Run the prestart hooks
		if err := tr.prestart(); err != nil {
			tr.logger.Error("prestart failed", "error", err)
			tr.restartTracker.SetStartError(err)
			goto RESTART
		}

		select {
		case <-tr.killCtx.Done():
			break MAIN
		case <-tr.shutdownCtx.Done():
			// TaskRunner was told to exit immediately
			return
		default:
		}

		// Run the task
		if err := tr.runDriver(); err != nil {
			tr.logger.Error("running driver failed", "error", err)
			tr.restartTracker.SetStartError(err)
			goto RESTART
		}

		// Run the poststart hooks
		if err := tr.poststart(); err != nil {
			tr.logger.Error("poststart failed", "error", err)
		}

		// Grab the result proxy and wait for task to exit
	WAIT:
		{
			handle := tr.getDriverHandle()
			result = nil

			// Do *not* use tr.killCtx here as it would cause
			// Wait() to unblock before the task exits when Kill()
			// is called.
			if resultCh, err := handle.WaitCh(context.Background()); err != nil {
				tr.logger.Error("wait task failed", "error", err)
			} else {
				select {
				case <-tr.killCtx.Done():
					// We can go through the normal should-restart check since
					// the restart tracker knows it is killed.
					result = tr.handleKill()
				case <-tr.shutdownCtx.Done():
					// TaskRunner was told to exit immediately
					return
				case result = <-resultCh:
				}

				// WaitCh returned a result
				if retryWait := tr.handleTaskExitResult(result); retryWait {
					goto WAIT
				}
			}
		}

		// Clear the handle
		tr.clearDriverHandle()

		// Store the wait result on the restart tracker
		tr.restartTracker.SetExitResult(result)

		if err := tr.exited(); err != nil {
			tr.logger.Error("exited hooks failed", "error", err)
		}

	RESTART:
		restart, restartDelay := tr.shouldRestart()
		if !restart {
			break MAIN
		}

		// Actually restart by sleeping and also watching for destroy events
		select {
		case <-time.After(restartDelay):
		case <-tr.killCtx.Done():
			tr.logger.Trace("task killed between restarts", "delay", restartDelay)
			break MAIN
		case <-tr.shutdownCtx.Done():
			// TaskRunner was told to exit immediately
			tr.logger.Trace("gracefully shutting down during restart delay")
			return
		}
	}

	// Ensure the handle is cleaned up. Restore could have recovered a task
	// that should be terminal, so if the handle still exists we should
	// kill it here.
	if tr.getDriverHandle() != nil {
		if result = tr.handleKill(); result != nil {
			tr.emitExitResultEvent(result)
		}

		tr.clearDriverHandle()

		if err := tr.exited(); err != nil {
			tr.logger.Error("exited hooks failed while cleaning up terminal task", "error", err)
		}
	}

	// Mark the task as dead
	tr.UpdateState(structs.TaskStateDead, nil)

	// Run the stop hooks
	if err := tr.stop(); err != nil {
		tr.logger.Error("stop failed", "error", err)
	}

	tr.logger.Debug("task run loop exiting")
}

func (tr *TaskRunner) shouldShutdown() bool {
	if tr.alloc.ClientTerminalStatus() {
		return true
	}

	if !tr.IsPoststopTask() && tr.alloc.ServerTerminalStatus() {
		return true
	}

	return false
}

// handleTaskExitResult handles the results returned by the task exiting. If
// retryWait is true, the caller should attempt to wait on the task again since
// it has not actually finished running. This can happen if the driver plugin
// has exited.
func (tr *TaskRunner) handleTaskExitResult(result *drivers.ExitResult) (retryWait bool) {
	if result == nil {
		return false
	}

	if result.Err == bstructs.ErrPluginShutdown {
		dn := tr.Task().Driver
		tr.logger.Debug("driver plugin has shutdown; attempting to recover task", "driver", dn)

		// Initialize a new driver handle
		if err := tr.initDriver(); err != nil {
			tr.logger.Error("failed to initialize driver after it exited unexpectedly", "error", err, "driver", dn)
			return false
		}

		// Try to restore the handle
		tr.stateLock.RLock()
		h := tr.localState.TaskHandle
		net := tr.localState.DriverNetwork
		tr.stateLock.RUnlock()
		if !tr.restoreHandle(h, net) {
			tr.logger.Error("failed to restore handle on driver after it exited unexpectedly", "driver", dn)
			return false
		}

		tr.logger.Debug("task successfully recovered on driver", "driver", dn)
		return true
	}

	// Emit Terminated event
	tr.emitExitResultEvent(result)

	return false
}

// emitExitResultEvent emits a TaskTerminated event for an ExitResult.
func (tr *TaskRunner) emitExitResultEvent(result *drivers.ExitResult) {
	event := structs.NewTaskEvent(structs.TaskTerminated).
		SetExitCode(result.ExitCode).
		SetSignal(result.Signal).
		SetOOMKilled(result.OOMKilled).
		SetExitMessage(result.Err)

	tr.EmitEvent(event)

	if result.OOMKilled {
		metrics.IncrCounterWithLabels([]string{"client", "allocs", "oom_killed"}, 1, tr.baseLabels)
	}
}

// handleUpdates runs update hooks when triggerUpdateCh is ticked and exits
// when Run has returned. Should only be run in a goroutine from Run.
func (tr *TaskRunner) handleUpdates() {
	for {
		select {
		case <-tr.triggerUpdateCh:
		case <-tr.waitCh:
			return
		}

		// Non-terminal update; run hooks
		tr.updateHooks()
	}
}

// shouldRestart determines whether the task should be restarted and updates
// the task state unless the task is killed or terminated.
func (tr *TaskRunner) shouldRestart() (bool, time.Duration) {
	// Determine if we should restart
	state, when := tr.restartTracker.GetState()
	reason := tr.restartTracker.GetReason()
	switch state {
	case structs.TaskKilled:
		// Never restart an explicitly killed task. Kill method handles
		// updating the server.
		tr.EmitEvent(structs.NewTaskEvent(state))
		return false, 0
	case structs.TaskNotRestarting, structs.TaskTerminated:
		tr.logger.Info("not restarting task", "reason", reason)
		if state == structs.TaskNotRestarting {
			tr.UpdateState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskNotRestarting).SetRestartReason(reason).SetFailsTask())
		}
		return false, 0
	case structs.TaskRestarting:
		tr.logger.Info("restarting task", "reason", reason, "delay", when)
		tr.UpdateState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskRestarting).SetRestartDelay(when).SetRestartReason(reason))
		return true, when
	default:
		tr.logger.Error("restart tracker returned unknown state", "state", state)
		return true, when
	}
}

// runDriver runs the driver and waits for it to exit
// runDriver emits an appropriate task event on success/failure
func (tr *TaskRunner) runDriver() error {

	taskConfig := tr.buildTaskConfig()

	// Build hcl context variables
	vars, errs, err := tr.envBuilder.Build().AllValues()
	if err != nil {
		return fmt.Errorf("error building environment variables: %v", err)
	}

	// Handle per-key errors
	if len(errs) > 0 {
		keys := make([]string, 0, len(errs))
		for k, err := range errs {
			keys = append(keys, k)

			if tr.logger.IsTrace() {
				// Verbosely log every diagnostic for debugging
				tr.logger.Trace("error building environment variables", "key", k, "error", err)
			}
		}

		tr.logger.Warn("some environment variables not available for rendering", "keys", strings.Join(keys, ", "))
	}

	val, diag, diagErrs := hclutils.ParseHclInterface(tr.task.Config, tr.taskSchema, vars)
	if diag.HasErrors() {
		parseErr := multierror.Append(errors.New("failed to parse config: "), diagErrs...)
		tr.EmitEvent(structs.NewTaskEvent(structs.TaskFailedValidation).SetValidationError(parseErr))
		return parseErr
	}

	if err := taskConfig.EncodeDriverConfig(val); err != nil {
		encodeErr := fmt.Errorf("failed to encode driver config: %v", err)
		tr.EmitEvent(structs.NewTaskEvent(structs.TaskFailedValidation).SetValidationError(encodeErr))
		return encodeErr
	}

	// If there's already a task handle (eg from a Restore) there's nothing
	// to do except update state.
	if tr.getDriverHandle() != nil {
		// Ensure running state is persisted but do *not* append a new
		// task event as restoring is a client event and not relevant
		// to a task's lifecycle.
		if err := tr.updateStateImpl(structs.TaskStateRunning); err != nil {
			//TODO return error and destroy task to avoid an orphaned task?
			tr.logger.Warn("error persisting task state", "error", err)
		}
		return nil
	}

	// Start the task if there's no existing handle (or if RecoverTask failed)
	handle, net, err := tr.driver.StartTask(taskConfig)
	if err != nil {
		// The plugin has died, try relaunching it
		if err == bstructs.ErrPluginShutdown {
			tr.logger.Info("failed to start task because plugin shutdown unexpectedly; attempting to recover")
			if err := tr.initDriver(); err != nil {
				taskErr := fmt.Errorf("failed to initialize driver after it exited unexpectedly: %v", err)
				tr.EmitEvent(structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(taskErr))
				return taskErr
			}

			handle, net, err = tr.driver.StartTask(taskConfig)
			if err != nil {
				taskErr := fmt.Errorf("failed to start task after driver exited unexpectedly: %v", err)
				tr.EmitEvent(structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(taskErr))
				return taskErr
			}
		} else {
			// Do *NOT* wrap the error here without maintaining whether or
			// not it is Recoverable. You must emit a task event failure to
			// be considered Recoverable.
			tr.EmitEvent(structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(err))
			return err
		}
	}

	tr.stateLock.Lock()
	tr.localState.TaskHandle = handle
	tr.localState.DriverNetwork = net
	if err := tr.stateDB.PutTaskRunnerLocalState(tr.allocID, tr.taskName, tr.localState); err != nil {
		//TODO Nomad will be unable to restore this task; try to kill
		// it now and fail? In general we prefer to leave running
		// tasks running even if the agent encounters an error.
		tr.logger.Warn("error persisting local task state; may be unable to restore after a Nomad restart",
			"error", err, "task_id", handle.Config.ID)
	}
	tr.stateLock.Unlock()

	tr.setDriverHandle(NewDriverHandle(tr.driver, taskConfig.ID, tr.Task(), net))

	// Emit an event that we started
	tr.UpdateState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted))
	return nil
}

// initDriver retrieves the DriverPlugin from the plugin loader for this task.
func (tr *TaskRunner) initDriver() error {
	driver, err := tr.driverManager.Dispense(tr.Task().Driver)
	if err != nil {
		return err
	}
	tr.driver = driver

	schema, err := tr.driver.TaskConfigSchema()
	if err != nil {
		return err
	}
	spec, diag := hclspecutils.Convert(schema)
	if diag.HasErrors() {
		return multierror.Append(errors.New("failed to convert task schema"), diag.Errs()...)
	}
	tr.taskSchema = spec

	caps, err := tr.driver.Capabilities()
	if err != nil {
		return err
	}
	tr.driverCapabilities = caps

	return nil
}

// handleKill is used to handle a request to kill a task. It will return
// the handle exit result if one is available and store any error in the task
// runner killErr value.
func (tr *TaskRunner) handleKill() *drivers.ExitResult {
	// Run the pre-kill hooks
	tr.preKill()

	// Wait for the task's ShutdownDelay after running the prekill hooks.
	// This allows for things like service deregistration to run
	// before waiting to kill the task.
	if delay := tr.Task().ShutdownDelay; delay != 0 {
		tr.logger.Debug("waiting before killing task", "shutdown_delay", delay)
		time.Sleep(delay)
	}

	// Tell the restart tracker that the task has been killed so it doesn't
	// attempt to restart it.
	tr.restartTracker.SetKilled()

	// Check it is running
	handle := tr.getDriverHandle()
	if handle == nil {
		return nil
	}

	// Kill the task using an exponential backoff in case of failures.
	killErr := tr.killTask(handle)
	if killErr != nil {
		// We couldn't successfully destroy the resource created.
		tr.logger.Error("failed to kill task. Resources may have been leaked", "error", killErr)
		tr.setKillErr(killErr)
	}

	// Block until the task has exited.
	waitCh, err := handle.WaitCh(tr.shutdownCtx)

	// The error should be nil or TaskNotFound; if it's something else then a
	// failure in the driver or transport layer occurred.
	if err != nil {
		if err == drivers.ErrTaskNotFound {
			return nil
		}
		tr.logger.Error("failed to wait on task. Resources may have been leaked", "error", err)
		tr.setKillErr(killErr)
		return nil
	}

	select {
	case result := <-waitCh:
		return result
	case <-tr.shutdownCtx.Done():
		return nil
	}
}

// killTask kills the task handle. In the case that killing fails,
// killTask will retry with an exponential backoff and will give up at a
// given limit. Returns an error if the task could not be killed.
func (tr *TaskRunner) killTask(handle *DriverHandle) error {
	// Cap the number of times we attempt to kill the task.
	var err error
	for i := 0; i < killFailureLimit; i++ {
		if err = handle.Kill(); err != nil {
			if err == drivers.ErrTaskNotFound {
				tr.logger.Warn("couldn't find task to kill", "task_id", handle.ID())
				return nil
			}
			// Calculate the new backoff
			backoff := (1 << (2 * uint64(i))) * killBackoffBaseline
			if backoff > killBackoffLimit {
				backoff = killBackoffLimit
			}

			tr.logger.Error("failed to kill task", "backoff", backoff, "error", err)
			time.Sleep(backoff)
		} else {
			// Kill was successful
			return nil
		}
	}
	return err
}

// persistLocalState persists local state to disk synchronously.
func (tr *TaskRunner) persistLocalState() error {
	tr.stateLock.RLock()
	defer tr.stateLock.RUnlock()

	return tr.stateDB.PutTaskRunnerLocalState(tr.allocID, tr.taskName, tr.localState)
}

// buildTaskConfig builds a drivers.TaskConfig with a unique ID for the task.
// The ID is unique for every invocation; it is built from the alloc ID, task
// name and 8 random characters.
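//
// As an illustrative sketch (the alloc ID and the random suffix below are
// hypothetical values, not taken from this code), the resulting ID looks
// like:
//
//	"9a6e0773-4f8a-54b2-8eff-2c1f3a9b4d10/redis/6a9b8c4d"
//
// i.e. "<alloc ID>/<task name>/<first 8 characters of a freshly generated
// UUID>".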
func (tr *TaskRunner) buildTaskConfig() *drivers.TaskConfig {
	task := tr.Task()
	alloc := tr.Alloc()
	invocationid := uuid.Generate()[:8]
	taskResources := tr.taskResources
	ports := tr.Alloc().AllocatedResources.Shared.Ports
	env := tr.envBuilder.Build()
	tr.networkIsolationLock.Lock()
	defer tr.networkIsolationLock.Unlock()

	var dns *drivers.DNSConfig
	if alloc.AllocatedResources != nil && len(alloc.AllocatedResources.Shared.Networks) > 0 {
		allocDNS := alloc.AllocatedResources.Shared.Networks[0].DNS
		if allocDNS != nil {
			dns = &drivers.DNSConfig{
				Servers:  allocDNS.Servers,
				Searches: allocDNS.Searches,
				Options:  allocDNS.Options,
			}
		}
	}

	return &drivers.TaskConfig{
		ID:            fmt.Sprintf("%s/%s/%s", alloc.ID, task.Name, invocationid),
		Name:          task.Name,
		JobName:       alloc.Job.Name,
		TaskGroupName: alloc.TaskGroup,
		Resources: &drivers.Resources{
			NomadResources: taskResources,
			LinuxResources: &drivers.LinuxResources{
				MemoryLimitBytes: taskResources.Memory.MemoryMB * 1024 * 1024,
				CPUShares:        taskResources.Cpu.CpuShares,
				PercentTicks:     float64(taskResources.Cpu.CpuShares) / float64(tr.clientConfig.Node.NodeResources.Cpu.CpuShares),
			},
			Ports: &ports,
		},
		Devices:          tr.hookResources.getDevices(),
		Mounts:           tr.hookResources.getMounts(),
		Env:              env.Map(),
		DeviceEnv:        env.DeviceEnv(),
		User:             task.User,
		AllocDir:         tr.taskDir.AllocDir,
		StdoutPath:       tr.logmonHookConfig.stdoutFifo,
		StderrPath:       tr.logmonHookConfig.stderrFifo,
		AllocID:          tr.allocID,
		NetworkIsolation: tr.networkIsolationSpec,
		DNS:              dns,
	}
}

// Restore task runner state. Called by AllocRunner.Restore after NewTaskRunner
// but before Run so no locks need to be acquired.
func (tr *TaskRunner) Restore() error {
	ls, ts, err := tr.stateDB.GetTaskRunnerState(tr.allocID, tr.taskName)
	if err != nil {
		return err
	}

	if ls != nil {
		ls.Canonicalize()
		tr.localState = ls
	}

	if ts != nil {
		ts.Canonicalize()
		tr.state = ts
	}

	// If a TaskHandle was persisted, ensure it is valid or destroy it.
	if taskHandle := tr.localState.TaskHandle; taskHandle != nil {
		//TODO if RecoverTask returned the DriverNetwork we wouldn't
		// have to persist it at all!
		restored := tr.restoreHandle(taskHandle, tr.localState.DriverNetwork)

		// If the handle could not be restored, the alloc is
		// non-terminal, and the task isn't a system job: wait until
		// servers have been contacted before running. #1795
		if restored {
			return nil
		}

		alloc := tr.Alloc()
		if tr.state.State == structs.TaskStateDead || alloc.TerminalStatus() || alloc.Job.Type == structs.JobTypeSystem {
			return nil
		}

		tr.logger.Trace("failed to reattach to task; will not run until server is contacted")
		tr.waitOnServers = true

		ev := structs.NewTaskEvent(structs.TaskRestoreFailed).
			SetDisplayMessage("failed to restore task; will not run until server is contacted")
		tr.UpdateState(structs.TaskStatePending, ev)
	}

	return nil
}

// restoreHandle ensures a TaskHandle is valid by calling Driver.RecoverTask
// and sets the driver handle. If the TaskHandle is not valid, DestroyTask is
// called.
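//
// Note on the return value as implemented below: restoreHandle reports
// success (true) when the handle is recovered, when the handle has no Config,
// when the task was not expected to be running, or when cleanup via
// DestroyTask succeeds. It reports failure (false) only when RecoverTask
// fails for a task that should be running and the subsequent DestroyTask call
// also returns an error.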
func (tr *TaskRunner) restoreHandle(taskHandle *drivers.TaskHandle, net *drivers.DriverNetwork) (success bool) {
	// Ensure handle is well-formed
	if taskHandle.Config == nil {
		return true
	}

	if err := tr.driver.RecoverTask(taskHandle); err != nil {
		if tr.TaskState().State != structs.TaskStateRunning {
			// RecoverTask should fail if the Task wasn't running
			return true
		}

		tr.logger.Error("error recovering task; cleaning up",
			"error", err, "task_id", taskHandle.Config.ID)

		// Try to cleanup any existing task state in the plugin before restarting
		if err := tr.driver.DestroyTask(taskHandle.Config.ID, true); err != nil {
			// Ignore ErrTaskNotFound errors as ideally
			// this task has already been stopped and
			// therefore doesn't exist.
			if err != drivers.ErrTaskNotFound {
				tr.logger.Warn("error destroying unrecoverable task",
					"error", err, "task_id", taskHandle.Config.ID)
			}

			return false
		}

		return true
	}

	// Update driver handle on task runner
	tr.setDriverHandle(NewDriverHandle(tr.driver, taskHandle.Config.ID, tr.Task(), net))
	return true
}

// UpdateState sets the task runner's allocation state and triggers a server
// update.
func (tr *TaskRunner) UpdateState(state string, event *structs.TaskEvent) {
	tr.stateLock.Lock()
	defer tr.stateLock.Unlock()

	if event != nil {
		tr.logger.Trace("setting task state", "state", state, "event", event.Type)

		// Append the event
		tr.appendEvent(event)
	}

	// Update the state
	if err := tr.updateStateImpl(state); err != nil {
		// Only log the error as persistence errors should not
		// affect task state.
		tr.logger.Error("error persisting task state", "error", err, "event", event, "state", state)
	}

	// Notify the alloc runner of the transition
	tr.stateUpdater.TaskStateUpdated()
}

// updateStateImpl updates the in-memory task state and persists to disk.
func (tr *TaskRunner) updateStateImpl(state string) error {

	// Update the task state
	oldState := tr.state.State
	taskState := tr.state
	taskState.State = state

	// Handle the state transition.
	switch state {
	case structs.TaskStateRunning:
		// Capture the start time if it is just starting
		if oldState != structs.TaskStateRunning {
			taskState.StartedAt = time.Now().UTC()
			metrics.IncrCounterWithLabels([]string{"client", "allocs", "running"}, 1, tr.baseLabels)
		}
	case structs.TaskStateDead:
		// Capture the finished time if not already set
		if taskState.FinishedAt.IsZero() {
			taskState.FinishedAt = time.Now().UTC()
		}

		// Emit metrics to indicate task completion and failures
		if taskState.Failed {
			metrics.IncrCounterWithLabels([]string{"client", "allocs", "failed"}, 1, tr.baseLabels)
		} else {
			metrics.IncrCounterWithLabels([]string{"client", "allocs", "complete"}, 1, tr.baseLabels)
		}
	}

	// Persist the state and event
	return tr.stateDB.PutTaskState(tr.allocID, tr.taskName, taskState)
}

// EmitEvent appends a new TaskEvent to this task's TaskState. The actual
// TaskState.State (pending, running, dead) is not changed. Use UpdateState to
// transition states.
// Events are persisted locally and sent to the server, but errors are simply
// logged. Use AppendEvent to simply add a new event.
func (tr *TaskRunner) EmitEvent(event *structs.TaskEvent) {
	tr.stateLock.Lock()
	defer tr.stateLock.Unlock()

	tr.appendEvent(event)

	if err := tr.stateDB.PutTaskState(tr.allocID, tr.taskName, tr.state); err != nil {
		// Only a warning because the next event/state-transition will
		// try to persist it again.
		tr.logger.Warn("error persisting event", "error", err, "event", event)
	}

	// Notify the alloc runner of the event
	tr.stateUpdater.TaskStateUpdated()
}

// AppendEvent appends a new TaskEvent to this task's TaskState. The actual
// TaskState.State (pending, running, dead) is not changed. Use UpdateState to
// transition states.
// Events are persisted locally and errors are simply logged. Use EmitEvent to
// also update the AllocRunner.
func (tr *TaskRunner) AppendEvent(event *structs.TaskEvent) {
	tr.stateLock.Lock()
	defer tr.stateLock.Unlock()

	tr.appendEvent(event)

	if err := tr.stateDB.PutTaskState(tr.allocID, tr.taskName, tr.state); err != nil {
		// Only a warning because the next event/state-transition will
		// try to persist it again.
		tr.logger.Warn("error persisting event", "error", err, "event", event)
	}
}

// appendEvent to the task's event slice. Caller must acquire stateLock.
func (tr *TaskRunner) appendEvent(event *structs.TaskEvent) error {
	// Ensure the event is populated with human readable strings
	event.PopulateEventDisplayMessage()

	// Propagate failure from event to task state
	if event.FailsTask {
		tr.state.Failed = true
	}

	// XXX This seems like a super awkward spot for this? Why not shouldRestart?
	// Update restart metrics
	if event.Type == structs.TaskRestarting {
		metrics.IncrCounterWithLabels([]string{"client", "allocs", "restart"}, 1, tr.baseLabels)
		tr.state.Restarts++
		tr.state.LastRestart = time.Unix(0, event.Time)
	}

	// Append event to slice
	appendTaskEvent(tr.state, event, tr.maxEvents)

	return nil
}

// WaitCh is closed when TaskRunner.Run exits.
func (tr *TaskRunner) WaitCh() <-chan struct{} {
	return tr.waitCh
}

// Update the running allocation with a new version received from the server.
// Calls Update hooks asynchronously with Run.
//
// This method is safe for calling concurrently with Run and does not modify
// the passed in allocation.
func (tr *TaskRunner) Update(update *structs.Allocation) {
	task := update.LookupTask(tr.taskName)
	if task == nil {
		// This should not happen and likely indicates a bug in the
		// server or client.
		tr.logger.Error("allocation update is missing task; killing",
			"group", update.TaskGroup)
		te := structs.NewTaskEvent(structs.TaskKilled).
			SetKillReason("update missing task").
			SetFailsTask()
		tr.Kill(context.Background(), te)
		return
	}

	// Update tr.alloc
	tr.setAlloc(update, task)

	// Trigger update hooks if not terminal
	if !update.TerminalStatus() {
		tr.triggerUpdateHooks()
	}
}

// SetNetworkIsolation is called by the PreRun allocation hook after configuring
// the network isolation for the allocation
func (tr *TaskRunner) SetNetworkIsolation(n *drivers.NetworkIsolationSpec) {
	tr.networkIsolationLock.Lock()
	tr.networkIsolationSpec = n
	tr.networkIsolationLock.Unlock()
}

// triggerUpdateHooks triggers an update if there isn't already one pending.
// Should be called instead of calling updateHooks directly to serialize runs
// of update hooks. TaskRunner state should be updated prior to triggering
// update hooks.
//
// Does not block.
func (tr *TaskRunner) triggerUpdateHooks() {
	select {
	case tr.triggerUpdateCh <- struct{}{}:
	default:
		// already an update hook pending
	}
}

// Shutdown TaskRunner gracefully without affecting the state of the task.
// Shutdown blocks until the main Run loop exits.
func (tr *TaskRunner) Shutdown() {
	tr.logger.Trace("shutting down")
	tr.shutdownCtxCancel()

	<-tr.WaitCh()

	// Run shutdown hooks to cleanup
	tr.shutdownHooks()

	// Persist once more
	tr.persistLocalState()
}

// LatestResourceUsage returns the last resource utilization datapoint
// collected. May return nil if the task is not running or no resource
// utilization has been collected yet.
func (tr *TaskRunner) LatestResourceUsage() *cstructs.TaskResourceUsage {
	tr.resourceUsageLock.Lock()
	ru := tr.resourceUsage
	tr.resourceUsageLock.Unlock()

	// Look up device statistics lazily when fetched, as currently we do not emit any stats for them yet
	if ru != nil && tr.deviceStatsReporter != nil {
		deviceResources := tr.taskResources.Devices
		ru.ResourceUsage.DeviceStats = tr.deviceStatsReporter.LatestDeviceResourceStats(deviceResources)
	}
	return ru
}

// UpdateStats updates and emits the latest stats from the driver.
func (tr *TaskRunner) UpdateStats(ru *cstructs.TaskResourceUsage) {
	tr.resourceUsageLock.Lock()
	tr.resourceUsage = ru
	tr.resourceUsageLock.Unlock()
	if ru != nil {
		tr.emitStats(ru)
	}
}

//TODO Remove Backwardscompat or use tr.Alloc()?
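// setGaugeForMemory emits the task's memory gauges under the
// "client"/"allocs"/"memory" keys (rss, cache, swap, usage, max_usage,
// kernel_usage, kernel_max_usage, and allocated when the allocation is known),
// tagged with tr.baseLabels.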
func (tr *TaskRunner) setGaugeForMemory(ru *cstructs.TaskResourceUsage) {
	alloc := tr.Alloc()
	var allocatedMem float32
	if taskRes := alloc.AllocatedResources.Tasks[tr.taskName]; taskRes != nil {
		// Convert to bytes to match other memory metrics
		allocatedMem = float32(taskRes.Memory.MemoryMB) * 1024 * 1024
	}

	metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "rss"},
		float32(ru.ResourceUsage.MemoryStats.RSS), tr.baseLabels)
	metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "cache"},
		float32(ru.ResourceUsage.MemoryStats.Cache), tr.baseLabels)
	metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "swap"},
		float32(ru.ResourceUsage.MemoryStats.Swap), tr.baseLabels)
	metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "usage"},
		float32(ru.ResourceUsage.MemoryStats.Usage), tr.baseLabels)
	metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "max_usage"},
		float32(ru.ResourceUsage.MemoryStats.MaxUsage), tr.baseLabels)
	metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "kernel_usage"},
		float32(ru.ResourceUsage.MemoryStats.KernelUsage), tr.baseLabels)
	metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "kernel_max_usage"},
		float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage), tr.baseLabels)
	if allocatedMem > 0 {
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "allocated"},
			allocatedMem, tr.baseLabels)
	}
}

//TODO Remove Backwardscompat or use tr.Alloc()?
func (tr *TaskRunner) setGaugeForCPU(ru *cstructs.TaskResourceUsage) {
	alloc := tr.Alloc()
	var allocatedCPU float32
	if taskRes := alloc.AllocatedResources.Tasks[tr.taskName]; taskRes != nil {
		allocatedCPU = float32(taskRes.Cpu.CpuShares)
	}

	metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "total_percent"},
		float32(ru.ResourceUsage.CpuStats.Percent), tr.baseLabels)
	metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "system"},
		float32(ru.ResourceUsage.CpuStats.SystemMode), tr.baseLabels)
	metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "user"},
		float32(ru.ResourceUsage.CpuStats.UserMode), tr.baseLabels)
	metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "throttled_time"},
		float32(ru.ResourceUsage.CpuStats.ThrottledTime), tr.baseLabels)
	metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "throttled_periods"},
		float32(ru.ResourceUsage.CpuStats.ThrottledPeriods), tr.baseLabels)
	metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "total_ticks"},
		float32(ru.ResourceUsage.CpuStats.TotalTicks), tr.baseLabels)
	if allocatedCPU > 0 {
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "allocated"},
			allocatedCPU, tr.baseLabels)
	}
}

// emitStats emits resource usage stats of tasks to remote metrics collector
// sinks
func (tr *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) {
	if !tr.clientConfig.PublishAllocationMetrics {
		return
	}

	if ru.ResourceUsage.MemoryStats != nil {
		tr.setGaugeForMemory(ru)
	} else {
		tr.logger.Debug("Skipping memory stats for allocation", "reason", "MemoryStats is nil")
	}

	if ru.ResourceUsage.CpuStats != nil {
		tr.setGaugeForCPU(ru)
	} else {
		tr.logger.Debug("Skipping cpu stats for allocation", "reason", "CpuStats is nil")
	}
}

// appendTaskEvent updates the task status by appending the new event.
func appendTaskEvent(state *structs.TaskState, event *structs.TaskEvent, capacity int) {
	if state.Events == nil {
		state.Events = make([]*structs.TaskEvent, 1, capacity)
		state.Events[0] = event
		return
	}

	// If we hit capacity, then shift it.
	if len(state.Events) == capacity {
		old := state.Events
		state.Events = make([]*structs.TaskEvent, 0, capacity)
		state.Events = append(state.Events, old[1:]...)
	}

	state.Events = append(state.Events, event)
}

// TaskExecHandler returns the exec streaming handler for the running task, or
// nil if the task is not running.
func (tr *TaskRunner) TaskExecHandler() drivermanager.TaskExecHandler {
	// Check it is running
	handle := tr.getDriverHandle()
	if handle == nil {
		return nil
	}
	return handle.ExecStreaming
}

// DriverCapabilities returns the capabilities reported by the task's driver.
func (tr *TaskRunner) DriverCapabilities() (*drivers.Capabilities, error) {
	return tr.driver.Capabilities()
}

// SetAllocHookResources sets the alloc hook resources for this task runner.
func (tr *TaskRunner) SetAllocHookResources(res *cstructs.AllocHookResources) {
	tr.allocHookResources = res
}