github.com/bigcommerce/nomad@v0.9.3-bc/client/allocrunner/taskrunner/task_runner.go

package taskrunner

import (
	"context"
	"errors"
	"fmt"
	"strings"
	"sync"
	"time"

	metrics "github.com/armon/go-metrics"
	log "github.com/hashicorp/go-hclog"
	multierror "github.com/hashicorp/go-multierror"
	"github.com/hashicorp/hcl2/hcldec"
	"github.com/hashicorp/nomad/client/allocdir"
	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
	"github.com/hashicorp/nomad/client/allocrunner/taskrunner/restarts"
	"github.com/hashicorp/nomad/client/allocrunner/taskrunner/state"
	"github.com/hashicorp/nomad/client/config"
	"github.com/hashicorp/nomad/client/consul"
	"github.com/hashicorp/nomad/client/devicemanager"
	cinterfaces "github.com/hashicorp/nomad/client/interfaces"
	"github.com/hashicorp/nomad/client/pluginmanager/drivermanager"
	cstate "github.com/hashicorp/nomad/client/state"
	cstructs "github.com/hashicorp/nomad/client/structs"
	"github.com/hashicorp/nomad/client/taskenv"
	"github.com/hashicorp/nomad/client/vaultclient"
	"github.com/hashicorp/nomad/helper/pluginutils/hclspecutils"
	"github.com/hashicorp/nomad/helper/pluginutils/hclutils"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/structs"
	bstructs "github.com/hashicorp/nomad/plugins/base/structs"
	"github.com/hashicorp/nomad/plugins/drivers"
)

const (
	// defaultMaxEvents is the default max capacity for task events on the
	// task state. Overrideable for testing.
	defaultMaxEvents = 10

	// killBackoffBaseline is the baseline time for exponential backoff while
	// killing a task.
	killBackoffBaseline = 5 * time.Second

	// killBackoffLimit is the limit of the exponential backoff for killing
	// the task.
	killBackoffLimit = 2 * time.Minute

	// killFailureLimit is how many times we will attempt to kill a task before
	// giving up and potentially leaking resources.
	killFailureLimit = 5

	// triggerUpdateChCap is the capacity for the triggerUpdateCh used for
	// triggering updates. It should be exactly 1 as even if multiple
	// updates have come in since the last one was handled, we only need to
	// handle the last one.
	triggerUpdateChCap = 1
)
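
// Taken together, killTask (below) retries a failed kill up to
// killFailureLimit times, sleeping (1 << (2*i)) * killBackoffBaseline between
// attempts and capping each sleep at killBackoffLimit. With the values above
// that works out to waits of 5s, 20s, 80s, 2m, and 2m before giving up.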

type TaskRunner struct {
	// allocID, taskName, taskLeader, and taskResources are immutable so these fields may
	// be accessed without locks
	allocID string
	taskName string
	taskLeader bool
	taskResources *structs.AllocatedTaskResources

	alloc *structs.Allocation
	allocLock sync.Mutex

	clientConfig *config.Config

	// stateUpdater is used to emit updated task state
	stateUpdater interfaces.TaskStateHandler

	// state captures the state of the task for updating the allocation
	// Must acquire stateLock to access.
	state *structs.TaskState

	// localState captures the node-local state of the task for when the
	// Nomad agent restarts.
	// Must acquire stateLock to access.
	localState *state.LocalState

	// stateLock must be acquired when accessing state or localState.
	stateLock sync.RWMutex

	// stateDB is for persisting localState and taskState
	stateDB cstate.StateDB

	// shutdownCtx is used to exit the TaskRunner *without* affecting task state.
	shutdownCtx context.Context

	// shutdownCtxCancel causes the TaskRunner to exit immediately without
	// affecting task state. Useful for testing or graceful agent shutdown.
	shutdownCtxCancel context.CancelFunc

	// killCtx is the task runner's context representing the task's lifecycle.
	// The context is canceled when the task is killed.
	killCtx context.Context

	// killCtxCancel is called when killing a task.
	killCtxCancel context.CancelFunc

	// killErr is populated when killing a task. Access should be done using
	// the getter/setter.
	killErr error
	killErrLock sync.Mutex

	// logger is the logger for the task runner.
	logger log.Logger

	// triggerUpdateCh is ticked whenever update hooks need to be run and
	// must be created with cap=1 to signal a pending update and prevent
	// callers from deadlocking if the receiver has exited.
	triggerUpdateCh chan struct{}

	// waitCh is closed when the task runner has transitioned to a terminal
	// state
	waitCh chan struct{}

	// driver is the driver for the task.
	driver drivers.DriverPlugin

	// driverCapabilities is the set of capabilities the driver supports
	driverCapabilities *drivers.Capabilities

	// taskSchema is the hcl spec for the task driver configuration
	taskSchema hcldec.Spec

	// handleLock guards access to handle and handleResult
	handleLock sync.Mutex

	// handle to the running driver
	handle *DriverHandle

	// task is the task being run
	task *structs.Task
	taskLock sync.RWMutex

	// taskDir is the directory structure for this task.
	taskDir *allocdir.TaskDir

	// envBuilder is used to build the task's environment
	envBuilder *taskenv.Builder

	// restartTracker is used to decide if the task should be restarted.
	restartTracker *restarts.RestartTracker

	// runnerHooks are task runner lifecycle hooks that should be run on state
	// transitions.
	runnerHooks []interfaces.TaskHook

	// hookResources captures the resources provided by hooks
	hookResources *hookResources

	// consulClient is the client used by the consul service hook for
	// registering services and checks
	consulClient consul.ConsulServiceAPI

	// vaultClient is the client to use to derive and renew Vault tokens
	vaultClient vaultclient.VaultClient

	// vaultToken is the current Vault token. It should be accessed with the
	// getter.
	vaultToken string
	vaultTokenLock sync.Mutex

	// baseLabels are used when emitting tagged metrics. All task runner metrics
	// will have these tags, and optionally more.
	baseLabels []metrics.Label

	// logmonHookConfig is used to get the paths to the stdout and stderr fifos
	// to be passed to the driver for task logging
	logmonHookConfig *logmonHookConfig

	// resourceUsage is written via UpdateStats and read via
	// LatestResourceUsage. May be nil at any time.
	resourceUsage *cstructs.TaskResourceUsage
	resourceUsageLock sync.Mutex

	// deviceStatsReporter is used to lookup resource usage for alloc devices
	deviceStatsReporter cinterfaces.DeviceStatsReporter

	// devicemanager is used to mount devices as well as lookup device
	// statistics
	devicemanager devicemanager.Manager

	// driverManager is used to dispense driver plugins and register event
	// handlers
	driverManager drivermanager.Manager

	// maxEvents is the capacity of the TaskEvents on the TaskState.
	// Defaults to defaultMaxEvents but overrideable for testing.
	maxEvents int

	// serversContactedCh is passed to TaskRunners so they can detect when
	// GetClientAllocs has been called in case of a failed restore.
	serversContactedCh <-chan struct{}

	// waitOnServers defaults to false but will be set true if a restore
	// fails and the Run method should wait until serversContactedCh is
	// closed.
	waitOnServers bool
}

type Config struct {
	Alloc *structs.Allocation
	ClientConfig *config.Config
	Consul consul.ConsulServiceAPI
	Task *structs.Task
	TaskDir *allocdir.TaskDir
	Logger log.Logger

	// Vault is the client to use to derive and renew Vault tokens
	Vault vaultclient.VaultClient

	// StateDB is used to store and restore state.
	StateDB cstate.StateDB

	// StateUpdater is used to emit updated task state
	StateUpdater interfaces.TaskStateHandler

	// DeviceStatsReporter is used to lookup resource usage for alloc devices
	DeviceStatsReporter cinterfaces.DeviceStatsReporter

	// DeviceManager is used to mount devices as well as lookup device
	// statistics
	DeviceManager devicemanager.Manager

	// DriverManager is used to dispense driver plugins and register event
	// handlers
	DriverManager drivermanager.Manager

	// ServersContactedCh is closed when the first GetClientAllocs call to
	// servers succeeds and allocs are synced.
	ServersContactedCh chan struct{}
}

func NewTaskRunner(config *Config) (*TaskRunner, error) {
	// Create a context for causing the runner to exit
	trCtx, trCancel := context.WithCancel(context.Background())

	// Create a context for killing the runner
	killCtx, killCancel := context.WithCancel(context.Background())

	// Initialize the environment builder
	envBuilder := taskenv.NewBuilder(
		config.ClientConfig.Node,
		config.Alloc,
		config.Task,
		config.ClientConfig.Region,
	)

	// Initialize state from alloc if it is set
	tstate := structs.NewTaskState()
	if ts := config.Alloc.TaskStates[config.Task.Name]; ts != nil {
		tstate = ts.Copy()
	}

	tr := &TaskRunner{
		alloc: config.Alloc,
		allocID: config.Alloc.ID,
		clientConfig: config.ClientConfig,
		task: config.Task,
		taskDir: config.TaskDir,
		taskName: config.Task.Name,
		taskLeader: config.Task.Leader,
		envBuilder: envBuilder,
		consulClient: config.Consul,
		vaultClient: config.Vault,
		state: tstate,
		localState: state.NewLocalState(),
		stateDB: config.StateDB,
		stateUpdater: config.StateUpdater,
		deviceStatsReporter: config.DeviceStatsReporter,
		killCtx: killCtx,
		killCtxCancel: killCancel,
		shutdownCtx: trCtx,
		shutdownCtxCancel: trCancel,
		triggerUpdateCh: make(chan struct{}, triggerUpdateChCap),
		waitCh: make(chan struct{}),
		devicemanager: config.DeviceManager,
		driverManager: config.DriverManager,
		maxEvents: defaultMaxEvents,
		serversContactedCh: config.ServersContactedCh,
	}

	// Create the logger based on the allocation ID
	tr.logger = config.Logger.Named("task_runner").With("task", config.Task.Name)

	// Pull out the task's resources
	ares := tr.alloc.AllocatedResources
	if ares != nil {
		tres, ok := ares.Tasks[tr.taskName]
		if !ok {
			return nil, fmt.Errorf("no task resources found on allocation")
		}
		tr.taskResources = tres
	} else {
		// COMPAT(0.10): Upgrade from old resources to new resources
		// Grab the old task resources
		oldTr, ok := tr.alloc.TaskResources[tr.taskName]
		if !ok {
			return nil, fmt.Errorf("no task resources found on allocation")
		}

		// Convert the old to new
		tr.taskResources = &structs.AllocatedTaskResources{
			Cpu: structs.AllocatedCpuResources{
				CpuShares: int64(oldTr.CPU),
			},
			Memory: structs.AllocatedMemoryResources{
				MemoryMB: int64(oldTr.MemoryMB),
			},
			Networks: oldTr.Networks,
		}
	}

	// Build the restart tracker.
	tg := tr.alloc.Job.LookupTaskGroup(tr.alloc.TaskGroup)
	if tg == nil {
		tr.logger.Error("alloc missing task group")
		return nil, fmt.Errorf("alloc missing task group")
	}
	tr.restartTracker = restarts.NewRestartTracker(tg.RestartPolicy, tr.alloc.Job.Type)

	// Get the driver
	if err := tr.initDriver(); err != nil {
		tr.logger.Error("failed to create driver", "error", err)
		return nil, err
	}

	// Initialize the runner's hooks.
	tr.initHooks()

	// Initialize base labels
	tr.initLabels()

	// Initialize initial task received event
	tr.appendEvent(structs.NewTaskEvent(structs.TaskReceived))

	return tr, nil
}

func (tr *TaskRunner) initLabels() {
	alloc := tr.Alloc()
	tr.baseLabels = []metrics.Label{
		{
			Name: "job",
			Value: alloc.Job.Name,
		},
		{
			Name: "task_group",
			Value: alloc.TaskGroup,
		},
		{
			Name: "alloc_id",
			Value: tr.allocID,
		},
		{
			Name: "task",
			Value: tr.taskName,
		},
	}

	if tr.alloc.Job.ParentID != "" {
		tr.baseLabels = append(tr.baseLabels, metrics.Label{
			Name: "parent_id",
			Value: tr.alloc.Job.ParentID,
		})
		if strings.Contains(tr.alloc.Job.Name, "/dispatch-") {
			tr.baseLabels = append(tr.baseLabels, metrics.Label{
				Name: "dispatch_id",
				Value: strings.Split(tr.alloc.Job.Name, "/dispatch-")[1],
			})
		}
		if strings.Contains(tr.alloc.Job.Name, "/periodic-") {
			tr.baseLabels = append(tr.baseLabels, metrics.Label{
				Name: "periodic_id",
				Value: strings.Split(tr.alloc.Job.Name, "/periodic-")[1],
			})
		}
	}
}

// MarkFailedDead marks a task as failed and not to run. It is intended to be
// invoked when alloc runner prestart hooks fail.
// Should never be called alongside Run().
func (tr *TaskRunner) MarkFailedDead(reason string) {
	defer close(tr.waitCh)

	tr.stateLock.Lock()
	if err := tr.stateDB.PutTaskRunnerLocalState(tr.allocID, tr.taskName, tr.localState); err != nil {
		//TODO Nomad will be unable to restore this task; try to kill
		// it now and fail? In general we prefer to leave running
		// tasks running even if the agent encounters an error.
		tr.logger.Warn("error persisting local failed task state; may be unable to restore after a Nomad restart",
			"error", err)
	}
	tr.stateLock.Unlock()

	event := structs.NewTaskEvent(structs.TaskSetupFailure).
		SetDisplayMessage(reason).
		SetFailsTask()
	tr.UpdateState(structs.TaskStateDead, event)

	// Run the stop hooks in case task was a restored task that failed prestart
	if err := tr.stop(); err != nil {
		tr.logger.Error("stop failed while marking task dead", "error", err)
	}
}
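
// A typical lifecycle, as wired up by the alloc runner (illustrative sketch
// only; the Config literal below is abbreviated and not the exact set of
// fields the alloc runner populates):
//
//	tr, err := NewTaskRunner(&Config{
//		Alloc:        alloc,
//		ClientConfig: clientCfg,
//		Task:         task,
//		// ... remaining dependencies (state DB, managers, channels)
//	})
//	if err != nil {
//		return err
//	}
//	go tr.Run()   // Run must be started in its own goroutine
//	// ...
//	<-tr.WaitCh() // closed once Run has fully exited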

// Run the TaskRunner. Starts the user's task or reattaches to a restored task.
// Run closes WaitCh when it exits. Should be started in a goroutine.
func (tr *TaskRunner) Run() {
	defer close(tr.waitCh)
	var result *drivers.ExitResult

	tr.stateLock.RLock()
	dead := tr.state.State == structs.TaskStateDead
	tr.stateLock.RUnlock()

	// if restoring a dead task, ensure that task is cleared and all post hooks
	// are called without additional state updates
	if dead {
		// do cleanup functions without emitting any additional events/work to
		// handle cases where we restored a dead task and the client terminated
		// after the task finished but before post-run actions completed.
		tr.clearDriverHandle()
		tr.stateUpdater.TaskStateUpdated()
		if err := tr.stop(); err != nil {
			tr.logger.Error("stop failed on terminal task", "error", err)
		}
		return
	}

	// Updates are handled asynchronously with the other hooks but each
	// triggered update - whether due to alloc updates or a new vault token
	// - should be handled serially.
	go tr.handleUpdates()

	// If restore failed, wait until servers are contacted before running.
	// #1795
	if tr.waitOnServers {
		tr.logger.Info("task failed to restore; waiting to contact server before restarting")
		select {
		case <-tr.killCtx.Done():
		case <-tr.shutdownCtx.Done():
			return
		case <-tr.serversContactedCh:
			tr.logger.Info("server contacted; unblocking waiting task")
		}
	}

MAIN:
	for !tr.Alloc().TerminalStatus() {
		select {
		case <-tr.killCtx.Done():
			break MAIN
		case <-tr.shutdownCtx.Done():
			// TaskRunner was told to exit immediately
			return
		default:
		}

		// Run the prestart hooks
		if err := tr.prestart(); err != nil {
			tr.logger.Error("prestart failed", "error", err)
			tr.restartTracker.SetStartError(err)
			goto RESTART
		}

		select {
		case <-tr.killCtx.Done():
			break MAIN
		case <-tr.shutdownCtx.Done():
			// TaskRunner was told to exit immediately
			return
		default:
		}

		// Run the task
		if err := tr.runDriver(); err != nil {
			tr.logger.Error("running driver failed", "error", err)
			tr.EmitEvent(structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(err))
			tr.restartTracker.SetStartError(err)
			goto RESTART
		}

		// Run the poststart hooks
		if err := tr.poststart(); err != nil {
			tr.logger.Error("poststart failed", "error", err)
		}

		// Grab the result proxy and wait for task to exit
	WAIT:
		{
			handle := tr.getDriverHandle()
			result = nil

			// Do *not* use tr.killCtx here as it would cause
			// Wait() to unblock before the task exits when Kill()
			// is called.
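			//
			// If the wait fails because the driver plugin itself has
			// exited, handleTaskExitResult below attempts to re-attach
			// to the still-running task and returns retryWait=true, in
			// which case control jumps back to WAIT to wait on the
			// recovered handle.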
			if resultCh, err := handle.WaitCh(context.Background()); err != nil {
				tr.logger.Error("wait task failed", "error", err)
			} else {
				select {
				case <-tr.killCtx.Done():
					// We can go through the normal should restart check since
					// the restart tracker knows it is killed
					result = tr.handleKill()
				case <-tr.shutdownCtx.Done():
					// TaskRunner was told to exit immediately
					return
				case result = <-resultCh:
				}

				// WaitCh returned a result
				if retryWait := tr.handleTaskExitResult(result); retryWait {
					goto WAIT
				}
			}
		}

		// Clear the handle
		tr.clearDriverHandle()

		// Store the wait result on the restart tracker
		tr.restartTracker.SetExitResult(result)

		if err := tr.exited(); err != nil {
			tr.logger.Error("exited hooks failed", "error", err)
		}

	RESTART:
		restart, restartDelay := tr.shouldRestart()
		if !restart {
			break MAIN
		}

		// Actually restart by sleeping and also watching for destroy events
		select {
		case <-time.After(restartDelay):
		case <-tr.killCtx.Done():
			tr.logger.Trace("task killed between restarts", "delay", restartDelay)
			break MAIN
		case <-tr.shutdownCtx.Done():
			// TaskRunner was told to exit immediately
			tr.logger.Trace("gracefully shutting down during restart delay")
			return
		}
	}

	// Ensure handle is cleaned up. Restore could have recovered a task
	// that should be terminal, so if the handle still exists we should
	// kill it here.
	if tr.getDriverHandle() != nil {
		if result = tr.handleKill(); result != nil {
			tr.emitExitResultEvent(result)
		}

		tr.clearDriverHandle()

		if err := tr.exited(); err != nil {
			tr.logger.Error("exited hooks failed while cleaning up terminal task", "error", err)
		}
	}

	// Mark the task as dead
	tr.UpdateState(structs.TaskStateDead, nil)

	// Run the stop hooks
	if err := tr.stop(); err != nil {
		tr.logger.Error("stop failed", "error", err)
	}

	tr.logger.Debug("task run loop exiting")
}

// handleTaskExitResult handles the results returned by the task exiting. If
// retryWait is true, the caller should attempt to wait on the task again since
// it has not actually finished running. This can happen if the driver plugin
// has exited.
func (tr *TaskRunner) handleTaskExitResult(result *drivers.ExitResult) (retryWait bool) {
	if result == nil {
		return false
	}

	if result.Err == bstructs.ErrPluginShutdown {
		dn := tr.Task().Driver
		tr.logger.Debug("driver plugin has shutdown; attempting to recover task", "driver", dn)

		// Initialize a new driver handle
		if err := tr.initDriver(); err != nil {
			tr.logger.Error("failed to initialize driver after it exited unexpectedly", "error", err, "driver", dn)
			return false
		}

		// Try to restore the handle
		tr.stateLock.RLock()
		h := tr.localState.TaskHandle
		net := tr.localState.DriverNetwork
		tr.stateLock.RUnlock()
		if !tr.restoreHandle(h, net) {
			tr.logger.Error("failed to restore handle on driver after it exited unexpectedly", "driver", dn)
			return false
		}

		tr.logger.Debug("task successfully recovered on driver", "driver", dn)
		return true
	}

	// Emit Terminated event
	tr.emitExitResultEvent(result)

	return false
}

// emitExitResultEvent emits a TaskTerminated event for an ExitResult.
func (tr *TaskRunner) emitExitResultEvent(result *drivers.ExitResult) {
	event := structs.NewTaskEvent(structs.TaskTerminated).
		SetExitCode(result.ExitCode).
		SetSignal(result.Signal).
		SetOOMKilled(result.OOMKilled).
		SetExitMessage(result.Err)

	tr.EmitEvent(event)

	if result.OOMKilled && !tr.clientConfig.DisableTaggedMetrics {
		metrics.IncrCounterWithLabels([]string{"client", "allocs", "oom_killed"}, 1, tr.baseLabels)
	}
}

// handleUpdates runs update hooks when triggerUpdateCh is ticked and exits
// when Run has returned. Should only be run in a goroutine from Run.
func (tr *TaskRunner) handleUpdates() {
	for {
		select {
		case <-tr.triggerUpdateCh:
		case <-tr.waitCh:
			return
		}

		// Non-terminal update; run hooks
		tr.updateHooks()
	}
}

// shouldRestart determines whether the task should be restarted and updates
// the task state unless the task is killed or terminated.
func (tr *TaskRunner) shouldRestart() (bool, time.Duration) {
	// Determine if we should restart
	state, when := tr.restartTracker.GetState()
	reason := tr.restartTracker.GetReason()
	switch state {
	case structs.TaskKilled:
		// Never restart an explicitly killed task. Kill method handles
		// updating the server.
		tr.EmitEvent(structs.NewTaskEvent(state))
		return false, 0
	case structs.TaskNotRestarting, structs.TaskTerminated:
		tr.logger.Info("not restarting task", "reason", reason)
		if state == structs.TaskNotRestarting {
			tr.UpdateState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskNotRestarting).SetRestartReason(reason).SetFailsTask())
		}
		return false, 0
	case structs.TaskRestarting:
		tr.logger.Info("restarting task", "reason", reason, "delay", when)
		tr.UpdateState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskRestarting).SetRestartDelay(when).SetRestartReason(reason))
		return true, when
	default:
		tr.logger.Error("restart tracker returned unknown state", "state", state)
		return true, when
	}
}

// runDriver starts the task via the driver plugin, reusing an existing handle
// if one was restored.
func (tr *TaskRunner) runDriver() error {

	taskConfig := tr.buildTaskConfig()

	// Build hcl context variables
	vars, errs, err := tr.envBuilder.Build().AllValues()
	if err != nil {
		return fmt.Errorf("error building environment variables: %v", err)
	}

	// Handle per-key errors
	if len(errs) > 0 {
		keys := make([]string, 0, len(errs))
		for k, err := range errs {
			keys = append(keys, k)

			if tr.logger.IsTrace() {
				// Verbosely log every diagnostic for debugging
				tr.logger.Trace("error building environment variables", "key", k, "error", err)
			}
		}

		tr.logger.Warn("some environment variables not available for rendering", "keys", strings.Join(keys, ", "))
	}

	val, diag := hclutils.ParseHclInterface(tr.task.Config, tr.taskSchema, vars)
	if diag.HasErrors() {
		return multierror.Append(errors.New("failed to parse config"), diag.Errs()...)
	}

	if err := taskConfig.EncodeDriverConfig(val); err != nil {
		return fmt.Errorf("failed to encode driver config: %v", err)
	}

	// If there's already a task handle (eg from a Restore) there's nothing
	// to do except update state.
	if tr.getDriverHandle() != nil {
		// Ensure running state is persisted but do *not* append a new
		// task event as restoring is a client event and not relevant
		// to a task's lifecycle.
		if err := tr.updateStateImpl(structs.TaskStateRunning); err != nil {
			//TODO return error and destroy task to avoid an orphaned task?
			tr.logger.Warn("error persisting task state", "error", err)
		}
		return nil
	}

	// Start the job if there's no existing handle (or if RecoverTask failed)
	handle, net, err := tr.driver.StartTask(taskConfig)
	if err != nil {
		// The plugin has died, try relaunching it
		if err == bstructs.ErrPluginShutdown {
			tr.logger.Info("failed to start task because plugin shutdown unexpectedly; attempting to recover")
			if err := tr.initDriver(); err != nil {
				return fmt.Errorf("failed to initialize driver after it exited unexpectedly: %v", err)
			}

			handle, net, err = tr.driver.StartTask(taskConfig)
			if err != nil {
				return fmt.Errorf("failed to start task after driver exited unexpectedly: %v", err)
			}
		} else {
			// Do *NOT* wrap the error here without maintaining
			// whether or not it is Recoverable.
			return err
		}
	}

	tr.stateLock.Lock()
	tr.localState.TaskHandle = handle
	tr.localState.DriverNetwork = net
	if err := tr.stateDB.PutTaskRunnerLocalState(tr.allocID, tr.taskName, tr.localState); err != nil {
		//TODO Nomad will be unable to restore this task; try to kill
		// it now and fail? In general we prefer to leave running
		// tasks running even if the agent encounters an error.
		tr.logger.Warn("error persisting local task state; may be unable to restore after a Nomad restart",
			"error", err, "task_id", handle.Config.ID)
	}
	tr.stateLock.Unlock()

	tr.setDriverHandle(NewDriverHandle(tr.driver, taskConfig.ID, tr.Task(), net))

	// Emit an event that we started
	tr.UpdateState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted))
	return nil
}

// initDriver retrieves the DriverPlugin from the plugin loader for this task
func (tr *TaskRunner) initDriver() error {
	driver, err := tr.driverManager.Dispense(tr.Task().Driver)
	if err != nil {
		return err
	}
	tr.driver = driver

	schema, err := tr.driver.TaskConfigSchema()
	if err != nil {
		return err
	}
	spec, diag := hclspecutils.Convert(schema)
	if diag.HasErrors() {
		return multierror.Append(errors.New("failed to convert task schema"), diag.Errs()...)
	}
	tr.taskSchema = spec

	caps, err := tr.driver.Capabilities()
	if err != nil {
		return err
	}
	tr.driverCapabilities = caps

	return nil
}

// handleKill is used to handle a request to kill a task. It will return
// the handle exit result if one is available and store any error in the task
// runner killErr value.
func (tr *TaskRunner) handleKill() *drivers.ExitResult {
	// Run the pre killing hooks
	tr.preKill()

	// Tell the restart tracker that the task has been killed so it doesn't
	// attempt to restart it.
	tr.restartTracker.SetKilled()

	// Check it is running
	handle := tr.getDriverHandle()
	if handle == nil {
		return nil
	}

	// Kill the task using an exponential backoff in case of failures.
	killErr := tr.killTask(handle)
	if killErr != nil {
		// We couldn't successfully destroy the resource created.
		tr.logger.Error("failed to kill task. Resources may have been leaked", "error", killErr)
		tr.setKillErr(killErr)
	}

	// Block until task has exited.
	waitCh, err := handle.WaitCh(tr.shutdownCtx)

	// The error should be nil or TaskNotFound; if it's something else, a
	// failure in the driver or transport layer occurred
	if err != nil {
		if err == drivers.ErrTaskNotFound {
			return nil
		}
		tr.logger.Error("failed to wait on task. Resources may have been leaked", "error", err)
		tr.setKillErr(killErr)
		return nil
	}

	select {
	case result := <-waitCh:
		return result
	case <-tr.shutdownCtx.Done():
		return nil
	}
}

// killTask kills the task handle. In the case that killing fails,
// killTask will retry with an exponential backoff and will give up at a
// given limit. Returns an error if the task could not be killed.
func (tr *TaskRunner) killTask(handle *DriverHandle) error {
	// Cap the number of times we attempt to kill the task.
	var err error
	for i := 0; i < killFailureLimit; i++ {
		if err = handle.Kill(); err != nil {
			if err == drivers.ErrTaskNotFound {
				tr.logger.Warn("couldn't find task to kill", "task_id", handle.ID())
				return nil
			}
			// Calculate the new backoff
			backoff := (1 << (2 * uint64(i))) * killBackoffBaseline
			if backoff > killBackoffLimit {
				backoff = killBackoffLimit
			}

			tr.logger.Error("failed to kill task", "backoff", backoff, "error", err)
			time.Sleep(backoff)
		} else {
			// Kill was successful
			return nil
		}
	}
	return err
}

// persistLocalState persists local state to disk synchronously.
func (tr *TaskRunner) persistLocalState() error {
	tr.stateLock.RLock()
	defer tr.stateLock.RUnlock()

	return tr.stateDB.PutTaskRunnerLocalState(tr.allocID, tr.taskName, tr.localState)
}

// buildTaskConfig builds a drivers.TaskConfig with a unique ID for the task.
// The ID is unique for every invocation; it is built from the alloc ID, task
// name and 8 random characters.
func (tr *TaskRunner) buildTaskConfig() *drivers.TaskConfig {
	task := tr.Task()
	alloc := tr.Alloc()
	invocationid := uuid.Generate()[:8]
	taskResources := tr.taskResources
	env := tr.envBuilder.Build()

	return &drivers.TaskConfig{
		ID: fmt.Sprintf("%s/%s/%s", alloc.ID, task.Name, invocationid),
		Name: task.Name,
		JobName: alloc.Job.Name,
		TaskGroupName: alloc.TaskGroup,
		Resources: &drivers.Resources{
			NomadResources: taskResources,
			LinuxResources: &drivers.LinuxResources{
				MemoryLimitBytes: taskResources.Memory.MemoryMB * 1024 * 1024,
				CPUShares: taskResources.Cpu.CpuShares,
				PercentTicks: float64(taskResources.Cpu.CpuShares) / float64(tr.clientConfig.Node.NodeResources.Cpu.CpuShares),
			},
		},
		Devices: tr.hookResources.getDevices(),
		Mounts: tr.hookResources.getMounts(),
		Env: env.Map(),
		DeviceEnv: env.DeviceEnv(),
		User: task.User,
		AllocDir: tr.taskDir.AllocDir,
		StdoutPath: tr.logmonHookConfig.stdoutFifo,
		StderrPath: tr.logmonHookConfig.stderrFifo,
		AllocID: tr.allocID,
	}
}
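
// For illustration, a generated ID has the shape
// "<alloc-uuid>/<task-name>/<8-char-suffix>", e.g. (made-up values)
// "b4d1f9c2-7a36-48e1-9c2d-3f0e8a51b6d7/redis/9f2c1ab3".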

// Restore task runner state. Called by AllocRunner.Restore after NewTaskRunner
// but before Run so no locks need to be acquired.
func (tr *TaskRunner) Restore() error {
	ls, ts, err := tr.stateDB.GetTaskRunnerState(tr.allocID, tr.taskName)
	if err != nil {
		return err
	}

	if ls != nil {
		ls.Canonicalize()
		tr.localState = ls
	}

	if ts != nil {
		ts.Canonicalize()
		tr.state = ts
	}

	// If a TaskHandle was persisted, ensure it is valid or destroy it.
	if taskHandle := tr.localState.TaskHandle; taskHandle != nil {
		//TODO if RecoverTask returned the DriverNetwork we wouldn't
		// have to persist it at all!
		restored := tr.restoreHandle(taskHandle, tr.localState.DriverNetwork)

		// If the handle could not be restored, the alloc is
		// non-terminal, and the task isn't a system job: wait until
		// servers have been contacted before running. #1795
		if restored {
			return nil
		}

		alloc := tr.Alloc()
		if tr.state.State == structs.TaskStateDead || alloc.TerminalStatus() || alloc.Job.Type == structs.JobTypeSystem {
			return nil
		}

		tr.logger.Trace("failed to reattach to task; will not run until server is contacted")
		tr.waitOnServers = true

		ev := structs.NewTaskEvent(structs.TaskRestoreFailed).
			SetDisplayMessage("failed to restore task; will not run until server is contacted")
		tr.UpdateState(structs.TaskStatePending, ev)
	}

	return nil
}

// restoreHandle ensures a TaskHandle is valid by calling Driver.RecoverTask
// and sets the driver handle. If the TaskHandle is not valid, DestroyTask is
// called.
func (tr *TaskRunner) restoreHandle(taskHandle *drivers.TaskHandle, net *drivers.DriverNetwork) (success bool) {
	// Ensure handle is well-formed
	if taskHandle.Config == nil {
		return true
	}

	if err := tr.driver.RecoverTask(taskHandle); err != nil {
		if tr.TaskState().State != structs.TaskStateRunning {
			// RecoverTask should fail if the Task wasn't running
			return true
		}

		tr.logger.Error("error recovering task; cleaning up",
			"error", err, "task_id", taskHandle.Config.ID)

		// Try to cleanup any existing task state in the plugin before restarting
		if err := tr.driver.DestroyTask(taskHandle.Config.ID, true); err != nil {
			// Ignore ErrTaskNotFound errors as ideally
			// this task has already been stopped and
			// therefore doesn't exist.
			if err != drivers.ErrTaskNotFound {
				tr.logger.Warn("error destroying unrecoverable task",
					"error", err, "task_id", taskHandle.Config.ID)
			}

			return false
		}

		return true
	}

	// Update driver handle on task runner
	tr.setDriverHandle(NewDriverHandle(tr.driver, taskHandle.Config.ID, tr.Task(), net))
	return true
}

// UpdateState sets the task runner's allocation state and triggers a server
// update.
func (tr *TaskRunner) UpdateState(state string, event *structs.TaskEvent) {
	tr.stateLock.Lock()
	defer tr.stateLock.Unlock()

	if event != nil {
		tr.logger.Trace("setting task state", "state", state, "event", event.Type)

		// Append the event
		tr.appendEvent(event)
	}

	// Update the state
	if err := tr.updateStateImpl(state); err != nil {
		// Only log the error as persistence errors should not
		// affect task state.
		tr.logger.Error("error persisting task state", "error", err, "event", event, "state", state)
	}

	// Notify the alloc runner of the transition
	tr.stateUpdater.TaskStateUpdated()
}

// updateStateImpl updates the in-memory task state and persists to disk.
func (tr *TaskRunner) updateStateImpl(state string) error {

	// Update the task state
	oldState := tr.state.State
	taskState := tr.state
	taskState.State = state

	// Handle the state transition.
	switch state {
	case structs.TaskStateRunning:
		// Capture the start time if it is just starting
		if oldState != structs.TaskStateRunning {
			taskState.StartedAt = time.Now().UTC()
			if !tr.clientConfig.DisableTaggedMetrics {
				metrics.IncrCounterWithLabels([]string{"client", "allocs", "running"}, 1, tr.baseLabels)
			}
			//if r.config.BackwardsCompatibleMetrics {
			//metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "running"}, 1)
			//}
		}
	case structs.TaskStateDead:
		// Capture the finished time if not already set
		if taskState.FinishedAt.IsZero() {
			taskState.FinishedAt = time.Now().UTC()
		}

		// Emitting metrics to indicate task complete and failures
		if taskState.Failed {
			if !tr.clientConfig.DisableTaggedMetrics {
				metrics.IncrCounterWithLabels([]string{"client", "allocs", "failed"}, 1, tr.baseLabels)
			}
			//if r.config.BackwardsCompatibleMetrics {
			//metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "failed"}, 1)
			//}
		} else {
			if !tr.clientConfig.DisableTaggedMetrics {
				metrics.IncrCounterWithLabels([]string{"client", "allocs", "complete"}, 1, tr.baseLabels)
			}
			//if r.config.BackwardsCompatibleMetrics {
			//metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "complete"}, 1)
			//}
		}
	}

	// Persist the state and event
	return tr.stateDB.PutTaskState(tr.allocID, tr.taskName, taskState)
}

// EmitEvent appends a new TaskEvent to this task's TaskState. The actual
// TaskState.State (pending, running, dead) is not changed. Use UpdateState to
// transition states.
// Events are persisted locally and sent to the server, but errors are simply
// logged. Use AppendEvent to simply add a new event.
func (tr *TaskRunner) EmitEvent(event *structs.TaskEvent) {
	tr.stateLock.Lock()
	defer tr.stateLock.Unlock()

	tr.appendEvent(event)

	if err := tr.stateDB.PutTaskState(tr.allocID, tr.taskName, tr.state); err != nil {
		// Only a warning because the next event/state-transition will
		// try to persist it again.
		tr.logger.Warn("error persisting event", "error", err, "event", event)
	}

	// Notify the alloc runner of the event
	tr.stateUpdater.TaskStateUpdated()
}
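
// In short: UpdateState transitions TaskState.State, persists it, and
// notifies the alloc runner; EmitEvent only appends an event but still
// notifies the alloc runner; AppendEvent (below) appends and persists an
// event without notifying the alloc runner at all.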

// AppendEvent appends a new TaskEvent to this task's TaskState. The actual
// TaskState.State (pending, running, dead) is not changed. Use UpdateState to
// transition states.
// Events are persisted locally and errors are simply logged. Use EmitEvent
// to also update the AllocRunner.
func (tr *TaskRunner) AppendEvent(event *structs.TaskEvent) {
	tr.stateLock.Lock()
	defer tr.stateLock.Unlock()

	tr.appendEvent(event)

	if err := tr.stateDB.PutTaskState(tr.allocID, tr.taskName, tr.state); err != nil {
		// Only a warning because the next event/state-transition will
		// try to persist it again.
		tr.logger.Warn("error persisting event", "error", err, "event", event)
	}
}

// appendEvent to task's event slice. Caller must acquire stateLock.
func (tr *TaskRunner) appendEvent(event *structs.TaskEvent) error {
	// Ensure the event is populated with human readable strings
	event.PopulateEventDisplayMessage()

	// Propagate failure from event to task state
	if event.FailsTask {
		tr.state.Failed = true
	}

	// XXX This seems like a super awkward spot for this? Why not shouldRestart?
	// Update restart metrics
	if event.Type == structs.TaskRestarting {
		if !tr.clientConfig.DisableTaggedMetrics {
			metrics.IncrCounterWithLabels([]string{"client", "allocs", "restart"}, 1, tr.baseLabels)
		}
		//if r.config.BackwardsCompatibleMetrics {
		//metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "restart"}, 1)
		//}
		tr.state.Restarts++
		tr.state.LastRestart = time.Unix(0, event.Time)
	}

	// Append event to slice
	appendTaskEvent(tr.state, event, tr.maxEvents)

	return nil
}

// WaitCh is closed when TaskRunner.Run exits.
func (tr *TaskRunner) WaitCh() <-chan struct{} {
	return tr.waitCh
}

// Update the running allocation with a new version received from the server.
// Calls Update hooks asynchronously with Run.
//
// This method is safe for calling concurrently with Run and does not modify
// the passed in allocation.
func (tr *TaskRunner) Update(update *structs.Allocation) {
	task := update.LookupTask(tr.taskName)
	if task == nil {
		// This should not happen and likely indicates a bug in the
		// server or client.
		tr.logger.Error("allocation update is missing task; killing",
			"group", update.TaskGroup)
		te := structs.NewTaskEvent(structs.TaskKilled).
			SetKillReason("update missing task").
			SetFailsTask()
		tr.Kill(context.Background(), te)
		return
	}

	// Update tr.alloc
	tr.setAlloc(update, task)

	// Trigger update hooks if not terminal
	if !update.TerminalStatus() {
		tr.triggerUpdateHooks()
	}
}

// triggerUpdateHooks triggers an update if there isn't already one pending.
// Should be called instead of calling updateHooks directly to serialize runs
// of update hooks. TaskRunner state should be updated prior to triggering
// update hooks.
//
// Does not block.
func (tr *TaskRunner) triggerUpdateHooks() {
	select {
	case tr.triggerUpdateCh <- struct{}{}:
	default:
		// already an update hook pending
	}
}

// Shutdown TaskRunner gracefully without affecting the state of the task.
// Shutdown blocks until the main Run loop exits.
func (tr *TaskRunner) Shutdown() {
	tr.logger.Trace("shutting down")
	tr.shutdownCtxCancel()

	<-tr.WaitCh()

	// Run shutdown hooks to cleanup
	tr.shutdownHooks()

	// Persist once more
	tr.persistLocalState()
}
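
// Note that Shutdown is distinct from killing the task: it cancels
// shutdownCtx so the TaskRunner exits while leaving the task itself (and its
// recorded state) untouched, which is what a graceful agent shutdown wants.
// Stopping the task itself is driven through killCtx (see handleKill) and is
// exposed by the TaskRunner's Kill method, defined elsewhere in this package.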

// LatestResourceUsage returns the last resource utilization datapoint
// collected. May return nil if the task is not running or no resource
// utilization has been collected yet.
func (tr *TaskRunner) LatestResourceUsage() *cstructs.TaskResourceUsage {
	tr.resourceUsageLock.Lock()
	ru := tr.resourceUsage
	tr.resourceUsageLock.Unlock()

	// Look up device statistics lazily when fetched, as we do not currently
	// emit any stats for them.
	if ru != nil && tr.deviceStatsReporter != nil {
		deviceResources := tr.taskResources.Devices
		ru.ResourceUsage.DeviceStats = tr.deviceStatsReporter.LatestDeviceResourceStats(deviceResources)
	}
	return ru
}

// UpdateStats updates and emits the latest stats from the driver.
func (tr *TaskRunner) UpdateStats(ru *cstructs.TaskResourceUsage) {
	tr.resourceUsageLock.Lock()
	tr.resourceUsage = ru
	tr.resourceUsageLock.Unlock()
	if ru != nil {
		tr.emitStats(ru)
	}
}

//TODO Remove Backwardscompat or use tr.Alloc()?
func (tr *TaskRunner) setGaugeForMemory(ru *cstructs.TaskResourceUsage) {
	alloc := tr.Alloc()
	var allocatedMem float32
	if alloc.AllocatedResources != nil {
		if taskRes := alloc.AllocatedResources.Tasks[tr.taskName]; taskRes != nil {
			// Convert to bytes to match other memory metrics
			allocatedMem = float32(taskRes.Memory.MemoryMB) * 1024 * 1024
		}
	} else if taskRes := alloc.TaskResources[tr.taskName]; taskRes != nil {
		// COMPAT(0.11) Remove in 0.11 when TaskResources is removed
		allocatedMem = float32(taskRes.MemoryMB) * 1024 * 1024
	}

	if !tr.clientConfig.DisableTaggedMetrics {
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "rss"},
			float32(ru.ResourceUsage.MemoryStats.RSS), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "cache"},
			float32(ru.ResourceUsage.MemoryStats.Cache), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "swap"},
			float32(ru.ResourceUsage.MemoryStats.Swap), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "usage"},
			float32(ru.ResourceUsage.MemoryStats.Usage), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "max_usage"},
			float32(ru.ResourceUsage.MemoryStats.MaxUsage), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "kernel_usage"},
			float32(ru.ResourceUsage.MemoryStats.KernelUsage), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "kernel_max_usage"},
			float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage), tr.baseLabels)
		if allocatedMem > 0 {
			metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "allocated"},
				allocatedMem, tr.baseLabels)
		}
	}

	if tr.clientConfig.BackwardsCompatibleMetrics {
		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "rss"}, float32(ru.ResourceUsage.MemoryStats.RSS))
		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "cache"}, float32(ru.ResourceUsage.MemoryStats.Cache))
		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "swap"}, float32(ru.ResourceUsage.MemoryStats.Swap))
		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "usage"}, float32(ru.ResourceUsage.MemoryStats.Usage))
		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "max_usage"}, float32(ru.ResourceUsage.MemoryStats.MaxUsage))
		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "kernel_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelUsage))
		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "kernel_max_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage))
		if allocatedMem > 0 {
			metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "allocated"}, allocatedMem)
		}
	}
}

//TODO Remove Backwardscompat or use tr.Alloc()?
func (tr *TaskRunner) setGaugeForCPU(ru *cstructs.TaskResourceUsage) {
	if !tr.clientConfig.DisableTaggedMetrics {
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "total_percent"},
			float32(ru.ResourceUsage.CpuStats.Percent), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "system"},
			float32(ru.ResourceUsage.CpuStats.SystemMode), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "user"},
			float32(ru.ResourceUsage.CpuStats.UserMode), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "throttled_time"},
			float32(ru.ResourceUsage.CpuStats.ThrottledTime), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "throttled_periods"},
			float32(ru.ResourceUsage.CpuStats.ThrottledPeriods), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "total_ticks"},
			float32(ru.ResourceUsage.CpuStats.TotalTicks), tr.baseLabels)
	}

	if tr.clientConfig.BackwardsCompatibleMetrics {
		metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "total_percent"}, float32(ru.ResourceUsage.CpuStats.Percent))
		metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "system"}, float32(ru.ResourceUsage.CpuStats.SystemMode))
		metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "user"}, float32(ru.ResourceUsage.CpuStats.UserMode))
		metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "throttled_time"}, float32(ru.ResourceUsage.CpuStats.ThrottledTime))
		metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "throttled_periods"}, float32(ru.ResourceUsage.CpuStats.ThrottledPeriods))
		metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "total_ticks"}, float32(ru.ResourceUsage.CpuStats.TotalTicks))
	}
}

// emitStats emits resource usage stats of tasks to remote metrics collector
// sinks
func (tr *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) {
	if !tr.clientConfig.PublishAllocationMetrics {
		return
	}

	if ru.ResourceUsage.MemoryStats != nil {
		tr.setGaugeForMemory(ru)
	}

	if ru.ResourceUsage.CpuStats != nil {
		tr.setGaugeForCPU(ru)
	}
}
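
// The tagged metrics above are emitted under keys of the form
// client.allocs.memory.<stat> and client.allocs.cpu.<stat>, tagged with the
// job, task_group, alloc_id, and task labels built in initLabels. The
// BackwardsCompatibleMetrics variants encode the same identifiers directly
// into the metric key instead of using labels.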

// appendTaskEvent updates the task status by appending the new event.
func appendTaskEvent(state *structs.TaskState, event *structs.TaskEvent, capacity int) {
	if state.Events == nil {
		state.Events = make([]*structs.TaskEvent, 1, capacity)
		state.Events[0] = event
		return
	}

	// If we hit capacity, then shift it.
	if len(state.Events) == capacity {
		old := state.Events
		state.Events = make([]*structs.TaskEvent, 0, capacity)
		state.Events = append(state.Events, old[1:]...)
	}

	state.Events = append(state.Events, event)
}

func (tr *TaskRunner) TaskExecHandler() drivermanager.TaskExecHandler {
	return tr.getDriverHandle().ExecStreaming
}

func (tr *TaskRunner) DriverCapabilities() (*drivers.Capabilities, error) {
	return tr.driver.Capabilities()
}