github.com/ferranbt/nomad@v0.9.3-0.20190607002617-85c449b7667c/client/allocrunner/taskrunner/task_runner.go (about) 1 package taskrunner 2 3 import ( 4 "context" 5 "errors" 6 "fmt" 7 "strings" 8 "sync" 9 "time" 10 11 metrics "github.com/armon/go-metrics" 12 log "github.com/hashicorp/go-hclog" 13 multierror "github.com/hashicorp/go-multierror" 14 "github.com/hashicorp/hcl2/hcldec" 15 "github.com/hashicorp/nomad/client/allocdir" 16 "github.com/hashicorp/nomad/client/allocrunner/interfaces" 17 "github.com/hashicorp/nomad/client/allocrunner/taskrunner/restarts" 18 "github.com/hashicorp/nomad/client/allocrunner/taskrunner/state" 19 "github.com/hashicorp/nomad/client/config" 20 "github.com/hashicorp/nomad/client/consul" 21 "github.com/hashicorp/nomad/client/devicemanager" 22 cinterfaces "github.com/hashicorp/nomad/client/interfaces" 23 "github.com/hashicorp/nomad/client/pluginmanager/drivermanager" 24 cstate "github.com/hashicorp/nomad/client/state" 25 cstructs "github.com/hashicorp/nomad/client/structs" 26 "github.com/hashicorp/nomad/client/taskenv" 27 "github.com/hashicorp/nomad/client/vaultclient" 28 "github.com/hashicorp/nomad/helper/pluginutils/hclspecutils" 29 "github.com/hashicorp/nomad/helper/pluginutils/hclutils" 30 "github.com/hashicorp/nomad/helper/uuid" 31 "github.com/hashicorp/nomad/nomad/structs" 32 bstructs "github.com/hashicorp/nomad/plugins/base/structs" 33 "github.com/hashicorp/nomad/plugins/drivers" 34 ) 35 36 const ( 37 // defaultMaxEvents is the default max capacity for task events on the 38 // task state. Overrideable for testing. 39 defaultMaxEvents = 10 40 41 // killBackoffBaseline is the baseline time for exponential backoff while 42 // killing a task. 43 killBackoffBaseline = 5 * time.Second 44 45 // killBackoffLimit is the limit of the exponential backoff for killing 46 // the task. 47 killBackoffLimit = 2 * time.Minute 48 49 // killFailureLimit is how many times we will attempt to kill a task before 50 // giving up and potentially leaking resources. 51 killFailureLimit = 5 52 53 // triggerUpdatechCap is the capacity for the triggerUpdateCh used for 54 // triggering updates. It should be exactly 1 as even if multiple 55 // updates have come in since the last one was handled, we only need to 56 // handle the last one. 57 triggerUpdateChCap = 1 58 ) 59 60 type TaskRunner struct { 61 // allocID, taskName, taskLeader, and taskResources are immutable so these fields may 62 // be accessed without locks 63 allocID string 64 taskName string 65 taskLeader bool 66 taskResources *structs.AllocatedTaskResources 67 68 alloc *structs.Allocation 69 allocLock sync.Mutex 70 71 clientConfig *config.Config 72 73 // stateUpdater is used to emit updated task state 74 stateUpdater interfaces.TaskStateHandler 75 76 // state captures the state of the task for updating the allocation 77 // Must acquire stateLock to access. 78 state *structs.TaskState 79 80 // localState captures the node-local state of the task for when the 81 // Nomad agent restarts. 82 // Must acquire stateLock to access. 83 localState *state.LocalState 84 85 // stateLock must be acquired when accessing state or localState. 86 stateLock sync.RWMutex 87 88 // stateDB is for persisting localState and taskState 89 stateDB cstate.StateDB 90 91 // shutdownCtx is used to exit the TaskRunner *without* affecting task state. 92 shutdownCtx context.Context 93 94 // shutdownCtxCancel causes the TaskRunner to exit immediately without 95 // affecting task state. Useful for testing or graceful agent shutdown. 96 shutdownCtxCancel context.CancelFunc 97 98 // killCtx is the task runner's context representing the tasks's lifecycle. 99 // The context is canceled when the task is killed. 100 killCtx context.Context 101 102 // killCtxCancel is called when killing a task. 103 killCtxCancel context.CancelFunc 104 105 // killErr is populated when killing a task. Access should be done use the 106 // getter/setter 107 killErr error 108 killErrLock sync.Mutex 109 110 // Logger is the logger for the task runner. 111 logger log.Logger 112 113 // triggerUpdateCh is ticked whenever update hooks need to be run and 114 // must be created with cap=1 to signal a pending update and prevent 115 // callers from deadlocking if the receiver has exited. 116 triggerUpdateCh chan struct{} 117 118 // waitCh is closed when the task runner has transitioned to a terminal 119 // state 120 waitCh chan struct{} 121 122 // driver is the driver for the task. 123 driver drivers.DriverPlugin 124 125 // driverCapabilities is the set capabilities the driver supports 126 driverCapabilities *drivers.Capabilities 127 128 // taskSchema is the hcl spec for the task driver configuration 129 taskSchema hcldec.Spec 130 131 // handleLock guards access to handle and handleResult 132 handleLock sync.Mutex 133 134 // handle to the running driver 135 handle *DriverHandle 136 137 // task is the task being run 138 task *structs.Task 139 taskLock sync.RWMutex 140 141 // taskDir is the directory structure for this task. 142 taskDir *allocdir.TaskDir 143 144 // envBuilder is used to build the task's environment 145 envBuilder *taskenv.Builder 146 147 // restartTracker is used to decide if the task should be restarted. 148 restartTracker *restarts.RestartTracker 149 150 // runnerHooks are task runner lifecycle hooks that should be run on state 151 // transistions. 152 runnerHooks []interfaces.TaskHook 153 154 // hookResources captures the resources provided by hooks 155 hookResources *hookResources 156 157 // consulClient is the client used by the consul service hook for 158 // registering services and checks 159 consulClient consul.ConsulServiceAPI 160 161 // vaultClient is the client to use to derive and renew Vault tokens 162 vaultClient vaultclient.VaultClient 163 164 // vaultToken is the current Vault token. It should be accessed with the 165 // getter. 166 vaultToken string 167 vaultTokenLock sync.Mutex 168 169 // baseLabels are used when emitting tagged metrics. All task runner metrics 170 // will have these tags, and optionally more. 171 baseLabels []metrics.Label 172 173 // logmonHookConfig is used to get the paths to the stdout and stderr fifos 174 // to be passed to the driver for task logging 175 logmonHookConfig *logmonHookConfig 176 177 // resourceUsage is written via UpdateStats and read via 178 // LatestResourceUsage. May be nil at all times. 179 resourceUsage *cstructs.TaskResourceUsage 180 resourceUsageLock sync.Mutex 181 182 // deviceStatsReporter is used to lookup resource usage for alloc devices 183 deviceStatsReporter cinterfaces.DeviceStatsReporter 184 185 // devicemanager is used to mount devices as well as lookup device 186 // statistics 187 devicemanager devicemanager.Manager 188 189 // driverManager is used to dispense driver plugins and register event 190 // handlers 191 driverManager drivermanager.Manager 192 193 // maxEvents is the capacity of the TaskEvents on the TaskState. 194 // Defaults to defaultMaxEvents but overrideable for testing. 195 maxEvents int 196 197 // serversContactedCh is passed to TaskRunners so they can detect when 198 // GetClientAllocs has been called in case of a failed restore. 199 serversContactedCh <-chan struct{} 200 201 // waitOnServers defaults to false but will be set true if a restore 202 // fails and the Run method should wait until serversContactedCh is 203 // closed. 204 waitOnServers bool 205 } 206 207 type Config struct { 208 Alloc *structs.Allocation 209 ClientConfig *config.Config 210 Consul consul.ConsulServiceAPI 211 Task *structs.Task 212 TaskDir *allocdir.TaskDir 213 Logger log.Logger 214 215 // Vault is the client to use to derive and renew Vault tokens 216 Vault vaultclient.VaultClient 217 218 // StateDB is used to store and restore state. 219 StateDB cstate.StateDB 220 221 // StateUpdater is used to emit updated task state 222 StateUpdater interfaces.TaskStateHandler 223 224 // deviceStatsReporter is used to lookup resource usage for alloc devices 225 DeviceStatsReporter cinterfaces.DeviceStatsReporter 226 227 // DeviceManager is used to mount devices as well as lookup device 228 // statistics 229 DeviceManager devicemanager.Manager 230 231 // DriverManager is used to dispense driver plugins and register event 232 // handlers 233 DriverManager drivermanager.Manager 234 235 // ServersContactedCh is closed when the first GetClientAllocs call to 236 // servers succeeds and allocs are synced. 237 ServersContactedCh chan struct{} 238 } 239 240 func NewTaskRunner(config *Config) (*TaskRunner, error) { 241 // Create a context for causing the runner to exit 242 trCtx, trCancel := context.WithCancel(context.Background()) 243 244 // Create a context for killing the runner 245 killCtx, killCancel := context.WithCancel(context.Background()) 246 247 // Initialize the environment builder 248 envBuilder := taskenv.NewBuilder( 249 config.ClientConfig.Node, 250 config.Alloc, 251 config.Task, 252 config.ClientConfig.Region, 253 ) 254 255 // Initialize state from alloc if it is set 256 tstate := structs.NewTaskState() 257 if ts := config.Alloc.TaskStates[config.Task.Name]; ts != nil { 258 tstate = ts.Copy() 259 } 260 261 tr := &TaskRunner{ 262 alloc: config.Alloc, 263 allocID: config.Alloc.ID, 264 clientConfig: config.ClientConfig, 265 task: config.Task, 266 taskDir: config.TaskDir, 267 taskName: config.Task.Name, 268 taskLeader: config.Task.Leader, 269 envBuilder: envBuilder, 270 consulClient: config.Consul, 271 vaultClient: config.Vault, 272 state: tstate, 273 localState: state.NewLocalState(), 274 stateDB: config.StateDB, 275 stateUpdater: config.StateUpdater, 276 deviceStatsReporter: config.DeviceStatsReporter, 277 killCtx: killCtx, 278 killCtxCancel: killCancel, 279 shutdownCtx: trCtx, 280 shutdownCtxCancel: trCancel, 281 triggerUpdateCh: make(chan struct{}, triggerUpdateChCap), 282 waitCh: make(chan struct{}), 283 devicemanager: config.DeviceManager, 284 driverManager: config.DriverManager, 285 maxEvents: defaultMaxEvents, 286 serversContactedCh: config.ServersContactedCh, 287 } 288 289 // Create the logger based on the allocation ID 290 tr.logger = config.Logger.Named("task_runner").With("task", config.Task.Name) 291 292 // Pull out the task's resources 293 ares := tr.alloc.AllocatedResources 294 if ares != nil { 295 tres, ok := ares.Tasks[tr.taskName] 296 if !ok { 297 return nil, fmt.Errorf("no task resources found on allocation") 298 } 299 tr.taskResources = tres 300 } else { 301 // COMPAT(0.10): Upgrade from old resources to new resources 302 // Grab the old task resources 303 oldTr, ok := tr.alloc.TaskResources[tr.taskName] 304 if !ok { 305 return nil, fmt.Errorf("no task resources found on allocation") 306 } 307 308 // Convert the old to new 309 tr.taskResources = &structs.AllocatedTaskResources{ 310 Cpu: structs.AllocatedCpuResources{ 311 CpuShares: int64(oldTr.CPU), 312 }, 313 Memory: structs.AllocatedMemoryResources{ 314 MemoryMB: int64(oldTr.MemoryMB), 315 }, 316 Networks: oldTr.Networks, 317 } 318 } 319 320 // Build the restart tracker. 321 tg := tr.alloc.Job.LookupTaskGroup(tr.alloc.TaskGroup) 322 if tg == nil { 323 tr.logger.Error("alloc missing task group") 324 return nil, fmt.Errorf("alloc missing task group") 325 } 326 tr.restartTracker = restarts.NewRestartTracker(tg.RestartPolicy, tr.alloc.Job.Type) 327 328 // Get the driver 329 if err := tr.initDriver(); err != nil { 330 tr.logger.Error("failed to create driver", "error", err) 331 return nil, err 332 } 333 334 // Initialize the runners hooks. 335 tr.initHooks() 336 337 // Initialize base labels 338 tr.initLabels() 339 340 // Initialize initial task received event 341 tr.appendEvent(structs.NewTaskEvent(structs.TaskReceived)) 342 343 return tr, nil 344 } 345 346 func (tr *TaskRunner) initLabels() { 347 alloc := tr.Alloc() 348 tr.baseLabels = []metrics.Label{ 349 { 350 Name: "job", 351 Value: alloc.Job.Name, 352 }, 353 { 354 Name: "task_group", 355 Value: alloc.TaskGroup, 356 }, 357 { 358 Name: "alloc_id", 359 Value: tr.allocID, 360 }, 361 { 362 Name: "task", 363 Value: tr.taskName, 364 }, 365 } 366 367 if tr.alloc.Job.ParentID != "" { 368 tr.baseLabels = append(tr.baseLabels, metrics.Label{ 369 Name: "parent_id", 370 Value: tr.alloc.Job.ParentID, 371 }) 372 if strings.Contains(tr.alloc.Job.Name, "/dispatch-") { 373 tr.baseLabels = append(tr.baseLabels, metrics.Label{ 374 Name: "dispatch_id", 375 Value: strings.Split(tr.alloc.Job.Name, "/dispatch-")[1], 376 }) 377 } 378 if strings.Contains(tr.alloc.Job.Name, "/periodic-") { 379 tr.baseLabels = append(tr.baseLabels, metrics.Label{ 380 Name: "periodic_id", 381 Value: strings.Split(tr.alloc.Job.Name, "/periodic-")[1], 382 }) 383 } 384 } 385 } 386 387 // Run the TaskRunner. Starts the user's task or reattaches to a restored task. 388 // Run closes WaitCh when it exits. Should be started in a goroutine. 389 func (tr *TaskRunner) Run() { 390 defer close(tr.waitCh) 391 var result *drivers.ExitResult 392 393 // Updates are handled asynchronously with the other hooks but each 394 // triggered update - whether due to alloc updates or a new vault token 395 // - should be handled serially. 396 go tr.handleUpdates() 397 398 // If restore failed wait until servers are contacted before running. 399 // #1795 400 if tr.waitOnServers { 401 tr.logger.Info("task failed to restore; waiting to contact server before restarting") 402 select { 403 case <-tr.killCtx.Done(): 404 case <-tr.shutdownCtx.Done(): 405 return 406 case <-tr.serversContactedCh: 407 tr.logger.Info("server contacted; unblocking waiting task") 408 } 409 } 410 411 MAIN: 412 for !tr.Alloc().TerminalStatus() { 413 select { 414 case <-tr.killCtx.Done(): 415 break MAIN 416 case <-tr.shutdownCtx.Done(): 417 // TaskRunner was told to exit immediately 418 return 419 default: 420 } 421 422 // Run the prestart hooks 423 if err := tr.prestart(); err != nil { 424 tr.logger.Error("prestart failed", "error", err) 425 tr.restartTracker.SetStartError(err) 426 goto RESTART 427 } 428 429 select { 430 case <-tr.killCtx.Done(): 431 break MAIN 432 case <-tr.shutdownCtx.Done(): 433 // TaskRunner was told to exit immediately 434 return 435 default: 436 } 437 438 // Run the task 439 if err := tr.runDriver(); err != nil { 440 tr.logger.Error("running driver failed", "error", err) 441 tr.EmitEvent(structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(err)) 442 tr.restartTracker.SetStartError(err) 443 goto RESTART 444 } 445 446 // Run the poststart hooks 447 if err := tr.poststart(); err != nil { 448 tr.logger.Error("poststart failed", "error", err) 449 } 450 451 // Grab the result proxy and wait for task to exit 452 WAIT: 453 { 454 handle := tr.getDriverHandle() 455 result = nil 456 457 // Do *not* use tr.killCtx here as it would cause 458 // Wait() to unblock before the task exits when Kill() 459 // is called. 460 if resultCh, err := handle.WaitCh(context.Background()); err != nil { 461 tr.logger.Error("wait task failed", "error", err) 462 } else { 463 select { 464 case <-tr.killCtx.Done(): 465 // We can go through the normal should restart check since 466 // the restart tracker knowns it is killed 467 result = tr.handleKill() 468 case <-tr.shutdownCtx.Done(): 469 // TaskRunner was told to exit immediately 470 return 471 case result = <-resultCh: 472 } 473 474 // WaitCh returned a result 475 if retryWait := tr.handleTaskExitResult(result); retryWait { 476 goto WAIT 477 } 478 } 479 } 480 481 // Clear the handle 482 tr.clearDriverHandle() 483 484 // Store the wait result on the restart tracker 485 tr.restartTracker.SetExitResult(result) 486 487 if err := tr.exited(); err != nil { 488 tr.logger.Error("exited hooks failed", "error", err) 489 } 490 491 RESTART: 492 restart, restartDelay := tr.shouldRestart() 493 if !restart { 494 break MAIN 495 } 496 497 // Actually restart by sleeping and also watching for destroy events 498 select { 499 case <-time.After(restartDelay): 500 case <-tr.killCtx.Done(): 501 tr.logger.Trace("task killed between restarts", "delay", restartDelay) 502 break MAIN 503 case <-tr.shutdownCtx.Done(): 504 // TaskRunner was told to exit immediately 505 tr.logger.Trace("gracefully shutting down during restart delay") 506 return 507 } 508 } 509 510 // Ensure handle is cleaned up. Restore could have recovered a task 511 // that should be terminal, so if the handle still exists we should 512 // kill it here. 513 if tr.getDriverHandle() != nil { 514 if result = tr.handleKill(); result != nil { 515 tr.emitExitResultEvent(result) 516 } 517 518 tr.clearDriverHandle() 519 520 if err := tr.exited(); err != nil { 521 tr.logger.Error("exited hooks failed while cleaning up terminal task", "error", err) 522 } 523 } 524 525 // Mark the task as dead 526 tr.UpdateState(structs.TaskStateDead, nil) 527 528 // Run the stop hooks 529 if err := tr.stop(); err != nil { 530 tr.logger.Error("stop failed", "error", err) 531 } 532 533 tr.logger.Debug("task run loop exiting") 534 } 535 536 // handleTaskExitResult handles the results returned by the task exiting. If 537 // retryWait is true, the caller should attempt to wait on the task again since 538 // it has not actually finished running. This can happen if the driver plugin 539 // has exited. 540 func (tr *TaskRunner) handleTaskExitResult(result *drivers.ExitResult) (retryWait bool) { 541 if result == nil { 542 return false 543 } 544 545 if result.Err == bstructs.ErrPluginShutdown { 546 dn := tr.Task().Driver 547 tr.logger.Debug("driver plugin has shutdown; attempting to recover task", "driver", dn) 548 549 // Initialize a new driver handle 550 if err := tr.initDriver(); err != nil { 551 tr.logger.Error("failed to initialize driver after it exited unexpectedly", "error", err, "driver", dn) 552 return false 553 } 554 555 // Try to restore the handle 556 tr.stateLock.RLock() 557 h := tr.localState.TaskHandle 558 net := tr.localState.DriverNetwork 559 tr.stateLock.RUnlock() 560 if !tr.restoreHandle(h, net) { 561 tr.logger.Error("failed to restore handle on driver after it exited unexpectedly", "driver", dn) 562 return false 563 } 564 565 tr.logger.Debug("task successfully recovered on driver", "driver", dn) 566 return true 567 } 568 569 // Emit Terminated event 570 tr.emitExitResultEvent(result) 571 572 return false 573 } 574 575 // emitExitResultEvent emits a TaskTerminated event for an ExitResult. 576 func (tr *TaskRunner) emitExitResultEvent(result *drivers.ExitResult) { 577 event := structs.NewTaskEvent(structs.TaskTerminated). 578 SetExitCode(result.ExitCode). 579 SetSignal(result.Signal). 580 SetOOMKilled(result.OOMKilled). 581 SetExitMessage(result.Err) 582 583 tr.EmitEvent(event) 584 585 if result.OOMKilled && !tr.clientConfig.DisableTaggedMetrics { 586 metrics.IncrCounterWithLabels([]string{"client", "allocs", "oom_killed"}, 1, tr.baseLabels) 587 } 588 } 589 590 // handleUpdates runs update hooks when triggerUpdateCh is ticked and exits 591 // when Run has returned. Should only be run in a goroutine from Run. 592 func (tr *TaskRunner) handleUpdates() { 593 for { 594 select { 595 case <-tr.triggerUpdateCh: 596 case <-tr.waitCh: 597 return 598 } 599 600 // Non-terminal update; run hooks 601 tr.updateHooks() 602 } 603 } 604 605 // shouldRestart determines whether the task should be restarted and updates 606 // the task state unless the task is killed or terminated. 607 func (tr *TaskRunner) shouldRestart() (bool, time.Duration) { 608 // Determine if we should restart 609 state, when := tr.restartTracker.GetState() 610 reason := tr.restartTracker.GetReason() 611 switch state { 612 case structs.TaskKilled: 613 // Never restart an explicitly killed task. Kill method handles 614 // updating the server. 615 tr.EmitEvent(structs.NewTaskEvent(state)) 616 return false, 0 617 case structs.TaskNotRestarting, structs.TaskTerminated: 618 tr.logger.Info("not restarting task", "reason", reason) 619 if state == structs.TaskNotRestarting { 620 tr.UpdateState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskNotRestarting).SetRestartReason(reason).SetFailsTask()) 621 } 622 return false, 0 623 case structs.TaskRestarting: 624 tr.logger.Info("restarting task", "reason", reason, "delay", when) 625 tr.UpdateState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskRestarting).SetRestartDelay(when).SetRestartReason(reason)) 626 return true, when 627 default: 628 tr.logger.Error("restart tracker returned unknown state", "state", state) 629 return true, when 630 } 631 } 632 633 // runDriver runs the driver and waits for it to exit 634 func (tr *TaskRunner) runDriver() error { 635 636 taskConfig := tr.buildTaskConfig() 637 638 // Build hcl context variables 639 vars, errs, err := tr.envBuilder.Build().AllValues() 640 if err != nil { 641 return fmt.Errorf("error building environment variables: %v", err) 642 } 643 644 // Handle per-key errors 645 if len(errs) > 0 { 646 keys := make([]string, 0, len(errs)) 647 for k, err := range errs { 648 keys = append(keys, k) 649 650 if tr.logger.IsTrace() { 651 // Verbosely log every diagnostic for debugging 652 tr.logger.Trace("error building environment variables", "key", k, "error", err) 653 } 654 } 655 656 tr.logger.Warn("some environment variables not available for rendering", "keys", strings.Join(keys, ", ")) 657 } 658 659 val, diag := hclutils.ParseHclInterface(tr.task.Config, tr.taskSchema, vars) 660 if diag.HasErrors() { 661 return multierror.Append(errors.New("failed to parse config"), diag.Errs()...) 662 } 663 664 if err := taskConfig.EncodeDriverConfig(val); err != nil { 665 return fmt.Errorf("failed to encode driver config: %v", err) 666 } 667 668 // If there's already a task handle (eg from a Restore) there's nothing 669 // to do except update state. 670 if tr.getDriverHandle() != nil { 671 // Ensure running state is persisted but do *not* append a new 672 // task event as restoring is a client event and not relevant 673 // to a task's lifecycle. 674 if err := tr.updateStateImpl(structs.TaskStateRunning); err != nil { 675 //TODO return error and destroy task to avoid an orphaned task? 676 tr.logger.Warn("error persisting task state", "error", err) 677 } 678 return nil 679 } 680 681 // Start the job if there's no existing handle (or if RecoverTask failed) 682 handle, net, err := tr.driver.StartTask(taskConfig) 683 if err != nil { 684 // The plugin has died, try relaunching it 685 if err == bstructs.ErrPluginShutdown { 686 tr.logger.Info("failed to start task because plugin shutdown unexpectedly; attempting to recover") 687 if err := tr.initDriver(); err != nil { 688 return fmt.Errorf("failed to initialize driver after it exited unexpectedly: %v", err) 689 } 690 691 handle, net, err = tr.driver.StartTask(taskConfig) 692 if err != nil { 693 return fmt.Errorf("failed to start task after driver exited unexpectedly: %v", err) 694 } 695 } else { 696 // Do *NOT* wrap the error here without maintaining 697 // whether or not is Recoverable. 698 return err 699 } 700 } 701 702 tr.stateLock.Lock() 703 tr.localState.TaskHandle = handle 704 tr.localState.DriverNetwork = net 705 if err := tr.stateDB.PutTaskRunnerLocalState(tr.allocID, tr.taskName, tr.localState); err != nil { 706 //TODO Nomad will be unable to restore this task; try to kill 707 // it now and fail? In general we prefer to leave running 708 // tasks running even if the agent encounters an error. 709 tr.logger.Warn("error persisting local task state; may be unable to restore after a Nomad restart", 710 "error", err, "task_id", handle.Config.ID) 711 } 712 tr.stateLock.Unlock() 713 714 tr.setDriverHandle(NewDriverHandle(tr.driver, taskConfig.ID, tr.Task(), net)) 715 716 // Emit an event that we started 717 tr.UpdateState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted)) 718 return nil 719 } 720 721 // initDriver retrives the DriverPlugin from the plugin loader for this task 722 func (tr *TaskRunner) initDriver() error { 723 driver, err := tr.driverManager.Dispense(tr.Task().Driver) 724 if err != nil { 725 return err 726 } 727 tr.driver = driver 728 729 schema, err := tr.driver.TaskConfigSchema() 730 if err != nil { 731 return err 732 } 733 spec, diag := hclspecutils.Convert(schema) 734 if diag.HasErrors() { 735 return multierror.Append(errors.New("failed to convert task schema"), diag.Errs()...) 736 } 737 tr.taskSchema = spec 738 739 caps, err := tr.driver.Capabilities() 740 if err != nil { 741 return err 742 } 743 tr.driverCapabilities = caps 744 745 return nil 746 } 747 748 // handleKill is used to handle the a request to kill a task. It will return 749 // the handle exit result if one is available and store any error in the task 750 // runner killErr value. 751 func (tr *TaskRunner) handleKill() *drivers.ExitResult { 752 // Run the pre killing hooks 753 tr.preKill() 754 755 // Tell the restart tracker that the task has been killed so it doesn't 756 // attempt to restart it. 757 tr.restartTracker.SetKilled() 758 759 // Check it is running 760 handle := tr.getDriverHandle() 761 if handle == nil { 762 return nil 763 } 764 765 // Kill the task using an exponential backoff in-case of failures. 766 killErr := tr.killTask(handle) 767 if killErr != nil { 768 // We couldn't successfully destroy the resource created. 769 tr.logger.Error("failed to kill task. Resources may have been leaked", "error", killErr) 770 tr.setKillErr(killErr) 771 } 772 773 // Block until task has exited. 774 waitCh, err := handle.WaitCh(tr.shutdownCtx) 775 776 // The error should be nil or TaskNotFound, if it's something else then a 777 // failure in the driver or transport layer occurred 778 if err != nil { 779 if err == drivers.ErrTaskNotFound { 780 return nil 781 } 782 tr.logger.Error("failed to wait on task. Resources may have been leaked", "error", err) 783 tr.setKillErr(killErr) 784 return nil 785 } 786 787 select { 788 case result := <-waitCh: 789 return result 790 case <-tr.shutdownCtx.Done(): 791 return nil 792 } 793 } 794 795 // killTask kills the task handle. In the case that killing fails, 796 // killTask will retry with an exponential backoff and will give up at a 797 // given limit. Returns an error if the task could not be killed. 798 func (tr *TaskRunner) killTask(handle *DriverHandle) error { 799 // Cap the number of times we attempt to kill the task. 800 var err error 801 for i := 0; i < killFailureLimit; i++ { 802 if err = handle.Kill(); err != nil { 803 if err == drivers.ErrTaskNotFound { 804 tr.logger.Warn("couldn't find task to kill", "task_id", handle.ID()) 805 return nil 806 } 807 // Calculate the new backoff 808 backoff := (1 << (2 * uint64(i))) * killBackoffBaseline 809 if backoff > killBackoffLimit { 810 backoff = killBackoffLimit 811 } 812 813 tr.logger.Error("failed to kill task", "backoff", backoff, "error", err) 814 time.Sleep(backoff) 815 } else { 816 // Kill was successful 817 return nil 818 } 819 } 820 return err 821 } 822 823 // persistLocalState persists local state to disk synchronously. 824 func (tr *TaskRunner) persistLocalState() error { 825 tr.stateLock.RLock() 826 defer tr.stateLock.RUnlock() 827 828 return tr.stateDB.PutTaskRunnerLocalState(tr.allocID, tr.taskName, tr.localState) 829 } 830 831 // buildTaskConfig builds a drivers.TaskConfig with an unique ID for the task. 832 // The ID is unique for every invocation, it is built from the alloc ID, task 833 // name and 8 random characters. 834 func (tr *TaskRunner) buildTaskConfig() *drivers.TaskConfig { 835 task := tr.Task() 836 alloc := tr.Alloc() 837 invocationid := uuid.Generate()[:8] 838 taskResources := tr.taskResources 839 env := tr.envBuilder.Build() 840 841 return &drivers.TaskConfig{ 842 ID: fmt.Sprintf("%s/%s/%s", alloc.ID, task.Name, invocationid), 843 Name: task.Name, 844 JobName: alloc.Job.Name, 845 TaskGroupName: alloc.TaskGroup, 846 Resources: &drivers.Resources{ 847 NomadResources: taskResources, 848 LinuxResources: &drivers.LinuxResources{ 849 MemoryLimitBytes: taskResources.Memory.MemoryMB * 1024 * 1024, 850 CPUShares: taskResources.Cpu.CpuShares, 851 PercentTicks: float64(taskResources.Cpu.CpuShares) / float64(tr.clientConfig.Node.NodeResources.Cpu.CpuShares), 852 }, 853 }, 854 Devices: tr.hookResources.getDevices(), 855 Mounts: tr.hookResources.getMounts(), 856 Env: env.Map(), 857 DeviceEnv: env.DeviceEnv(), 858 User: task.User, 859 AllocDir: tr.taskDir.AllocDir, 860 StdoutPath: tr.logmonHookConfig.stdoutFifo, 861 StderrPath: tr.logmonHookConfig.stderrFifo, 862 AllocID: tr.allocID, 863 } 864 } 865 866 // Restore task runner state. Called by AllocRunner.Restore after NewTaskRunner 867 // but before Run so no locks need to be acquired. 868 func (tr *TaskRunner) Restore() error { 869 ls, ts, err := tr.stateDB.GetTaskRunnerState(tr.allocID, tr.taskName) 870 if err != nil { 871 return err 872 } 873 874 if ls != nil { 875 ls.Canonicalize() 876 tr.localState = ls 877 } 878 879 if ts != nil { 880 ts.Canonicalize() 881 tr.state = ts 882 } 883 884 // If a TaskHandle was persisted, ensure it is valid or destroy it. 885 if taskHandle := tr.localState.TaskHandle; taskHandle != nil { 886 //TODO if RecoverTask returned the DriverNetwork we wouldn't 887 // have to persist it at all! 888 restored := tr.restoreHandle(taskHandle, tr.localState.DriverNetwork) 889 890 // If the handle could not be restored, the alloc is 891 // non-terminal, and the task isn't a system job: wait until 892 // servers have been contacted before running. #1795 893 if restored { 894 return nil 895 } 896 897 alloc := tr.Alloc() 898 if alloc.TerminalStatus() || alloc.Job.Type == structs.JobTypeSystem { 899 return nil 900 } 901 902 tr.logger.Trace("failed to reattach to task; will not run until server is contacted") 903 tr.waitOnServers = true 904 905 ev := structs.NewTaskEvent(structs.TaskRestoreFailed). 906 SetDisplayMessage("failed to restore task; will not run until server is contacted") 907 tr.UpdateState(structs.TaskStatePending, ev) 908 } 909 910 return nil 911 } 912 913 // restoreHandle ensures a TaskHandle is valid by calling Driver.RecoverTask 914 // and sets the driver handle. If the TaskHandle is not valid, DestroyTask is 915 // called. 916 func (tr *TaskRunner) restoreHandle(taskHandle *drivers.TaskHandle, net *drivers.DriverNetwork) (success bool) { 917 // Ensure handle is well-formed 918 if taskHandle.Config == nil { 919 return true 920 } 921 922 if err := tr.driver.RecoverTask(taskHandle); err != nil { 923 if tr.TaskState().State != structs.TaskStateRunning { 924 // RecoverTask should fail if the Task wasn't running 925 return true 926 } 927 928 tr.logger.Error("error recovering task; cleaning up", 929 "error", err, "task_id", taskHandle.Config.ID) 930 931 // Try to cleanup any existing task state in the plugin before restarting 932 if err := tr.driver.DestroyTask(taskHandle.Config.ID, true); err != nil { 933 // Ignore ErrTaskNotFound errors as ideally 934 // this task has already been stopped and 935 // therefore doesn't exist. 936 if err != drivers.ErrTaskNotFound { 937 tr.logger.Warn("error destroying unrecoverable task", 938 "error", err, "task_id", taskHandle.Config.ID) 939 } 940 941 return false 942 } 943 944 return true 945 } 946 947 // Update driver handle on task runner 948 tr.setDriverHandle(NewDriverHandle(tr.driver, taskHandle.Config.ID, tr.Task(), net)) 949 return true 950 } 951 952 // UpdateState sets the task runners allocation state and triggers a server 953 // update. 954 func (tr *TaskRunner) UpdateState(state string, event *structs.TaskEvent) { 955 tr.stateLock.Lock() 956 defer tr.stateLock.Unlock() 957 958 if event != nil { 959 tr.logger.Trace("setting task state", "state", state, "event", event.Type) 960 961 // Append the event 962 tr.appendEvent(event) 963 } 964 965 // Update the state 966 if err := tr.updateStateImpl(state); err != nil { 967 // Only log the error as we persistence errors should not 968 // affect task state. 969 tr.logger.Error("error persisting task state", "error", err, "event", event, "state", state) 970 } 971 972 // Notify the alloc runner of the transition 973 tr.stateUpdater.TaskStateUpdated() 974 } 975 976 // updateStateImpl updates the in-memory task state and persists to disk. 977 func (tr *TaskRunner) updateStateImpl(state string) error { 978 979 // Update the task state 980 oldState := tr.state.State 981 taskState := tr.state 982 taskState.State = state 983 984 // Handle the state transition. 985 switch state { 986 case structs.TaskStateRunning: 987 // Capture the start time if it is just starting 988 if oldState != structs.TaskStateRunning { 989 taskState.StartedAt = time.Now().UTC() 990 if !tr.clientConfig.DisableTaggedMetrics { 991 metrics.IncrCounterWithLabels([]string{"client", "allocs", "running"}, 1, tr.baseLabels) 992 } 993 //if r.config.BackwardsCompatibleMetrics { 994 //metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "running"}, 1) 995 //} 996 } 997 case structs.TaskStateDead: 998 // Capture the finished time if not already set 999 if taskState.FinishedAt.IsZero() { 1000 taskState.FinishedAt = time.Now().UTC() 1001 } 1002 1003 // Emitting metrics to indicate task complete and failures 1004 if taskState.Failed { 1005 if !tr.clientConfig.DisableTaggedMetrics { 1006 metrics.IncrCounterWithLabels([]string{"client", "allocs", "failed"}, 1, tr.baseLabels) 1007 } 1008 //if r.config.BackwardsCompatibleMetrics { 1009 //metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "failed"}, 1) 1010 //} 1011 } else { 1012 if !tr.clientConfig.DisableTaggedMetrics { 1013 metrics.IncrCounterWithLabels([]string{"client", "allocs", "complete"}, 1, tr.baseLabels) 1014 } 1015 //if r.config.BackwardsCompatibleMetrics { 1016 //metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "complete"}, 1) 1017 //} 1018 } 1019 } 1020 1021 // Persist the state and event 1022 return tr.stateDB.PutTaskState(tr.allocID, tr.taskName, taskState) 1023 } 1024 1025 // EmitEvent appends a new TaskEvent to this task's TaskState. The actual 1026 // TaskState.State (pending, running, dead) is not changed. Use UpdateState to 1027 // transition states. 1028 // Events are persisted locally and sent to the server, but errors are simply 1029 // logged. Use AppendEvent to simply add a new event. 1030 func (tr *TaskRunner) EmitEvent(event *structs.TaskEvent) { 1031 tr.stateLock.Lock() 1032 defer tr.stateLock.Unlock() 1033 1034 tr.appendEvent(event) 1035 1036 if err := tr.stateDB.PutTaskState(tr.allocID, tr.taskName, tr.state); err != nil { 1037 // Only a warning because the next event/state-transition will 1038 // try to persist it again. 1039 tr.logger.Warn("error persisting event", "error", err, "event", event) 1040 } 1041 1042 // Notify the alloc runner of the event 1043 tr.stateUpdater.TaskStateUpdated() 1044 } 1045 1046 // AppendEvent appends a new TaskEvent to this task's TaskState. The actual 1047 // TaskState.State (pending, running, dead) is not changed. Use UpdateState to 1048 // transition states. 1049 // Events are persisted locally and errors are simply logged. Use EmitEvent 1050 // also update AllocRunner. 1051 func (tr *TaskRunner) AppendEvent(event *structs.TaskEvent) { 1052 tr.stateLock.Lock() 1053 defer tr.stateLock.Unlock() 1054 1055 tr.appendEvent(event) 1056 1057 if err := tr.stateDB.PutTaskState(tr.allocID, tr.taskName, tr.state); err != nil { 1058 // Only a warning because the next event/state-transition will 1059 // try to persist it again. 1060 tr.logger.Warn("error persisting event", "error", err, "event", event) 1061 } 1062 } 1063 1064 // appendEvent to task's event slice. Caller must acquire stateLock. 1065 func (tr *TaskRunner) appendEvent(event *structs.TaskEvent) error { 1066 // Ensure the event is populated with human readable strings 1067 event.PopulateEventDisplayMessage() 1068 1069 // Propagate failure from event to task state 1070 if event.FailsTask { 1071 tr.state.Failed = true 1072 } 1073 1074 // XXX This seems like a super awkward spot for this? Why not shouldRestart? 1075 // Update restart metrics 1076 if event.Type == structs.TaskRestarting { 1077 if !tr.clientConfig.DisableTaggedMetrics { 1078 metrics.IncrCounterWithLabels([]string{"client", "allocs", "restart"}, 1, tr.baseLabels) 1079 } 1080 //if r.config.BackwardsCompatibleMetrics { 1081 //metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "restart"}, 1) 1082 //} 1083 tr.state.Restarts++ 1084 tr.state.LastRestart = time.Unix(0, event.Time) 1085 } 1086 1087 // Append event to slice 1088 appendTaskEvent(tr.state, event, tr.maxEvents) 1089 1090 return nil 1091 } 1092 1093 // WaitCh is closed when TaskRunner.Run exits. 1094 func (tr *TaskRunner) WaitCh() <-chan struct{} { 1095 return tr.waitCh 1096 } 1097 1098 // Update the running allocation with a new version received from the server. 1099 // Calls Update hooks asynchronously with Run. 1100 // 1101 // This method is safe for calling concurrently with Run and does not modify 1102 // the passed in allocation. 1103 func (tr *TaskRunner) Update(update *structs.Allocation) { 1104 task := update.LookupTask(tr.taskName) 1105 if task == nil { 1106 // This should not happen and likely indicates a bug in the 1107 // server or client. 1108 tr.logger.Error("allocation update is missing task; killing", 1109 "group", update.TaskGroup) 1110 te := structs.NewTaskEvent(structs.TaskKilled). 1111 SetKillReason("update missing task"). 1112 SetFailsTask() 1113 tr.Kill(context.Background(), te) 1114 return 1115 } 1116 1117 // Update tr.alloc 1118 tr.setAlloc(update, task) 1119 1120 // Trigger update hooks if not terminal 1121 if !update.TerminalStatus() { 1122 tr.triggerUpdateHooks() 1123 } 1124 } 1125 1126 // triggerUpdate if there isn't already an update pending. Should be called 1127 // instead of calling updateHooks directly to serialize runs of update hooks. 1128 // TaskRunner state should be updated prior to triggering update hooks. 1129 // 1130 // Does not block. 1131 func (tr *TaskRunner) triggerUpdateHooks() { 1132 select { 1133 case tr.triggerUpdateCh <- struct{}{}: 1134 default: 1135 // already an update hook pending 1136 } 1137 } 1138 1139 // Shutdown TaskRunner gracefully without affecting the state of the task. 1140 // Shutdown blocks until the main Run loop exits. 1141 func (tr *TaskRunner) Shutdown() { 1142 tr.logger.Trace("shutting down") 1143 tr.shutdownCtxCancel() 1144 1145 <-tr.WaitCh() 1146 1147 // Run shutdown hooks to cleanup 1148 tr.shutdownHooks() 1149 1150 // Persist once more 1151 tr.persistLocalState() 1152 } 1153 1154 // LatestResourceUsage returns the last resource utilization datapoint 1155 // collected. May return nil if the task is not running or no resource 1156 // utilization has been collected yet. 1157 func (tr *TaskRunner) LatestResourceUsage() *cstructs.TaskResourceUsage { 1158 tr.resourceUsageLock.Lock() 1159 ru := tr.resourceUsage 1160 tr.resourceUsageLock.Unlock() 1161 1162 // Look up device statistics lazily when fetched, as currently we do not emit any stats for them yet 1163 if ru != nil && tr.deviceStatsReporter != nil { 1164 deviceResources := tr.taskResources.Devices 1165 ru.ResourceUsage.DeviceStats = tr.deviceStatsReporter.LatestDeviceResourceStats(deviceResources) 1166 } 1167 return ru 1168 } 1169 1170 // UpdateStats updates and emits the latest stats from the driver. 1171 func (tr *TaskRunner) UpdateStats(ru *cstructs.TaskResourceUsage) { 1172 tr.resourceUsageLock.Lock() 1173 tr.resourceUsage = ru 1174 tr.resourceUsageLock.Unlock() 1175 if ru != nil { 1176 tr.emitStats(ru) 1177 } 1178 } 1179 1180 //TODO Remove Backwardscompat or use tr.Alloc()? 1181 func (tr *TaskRunner) setGaugeForMemory(ru *cstructs.TaskResourceUsage) { 1182 alloc := tr.Alloc() 1183 var allocatedMem float32 1184 if taskRes := alloc.AllocatedResources.Tasks[tr.taskName]; taskRes != nil { 1185 // Convert to bytes to match other memory metrics 1186 allocatedMem = float32(taskRes.Memory.MemoryMB) * 1024 * 1024 1187 } 1188 1189 if !tr.clientConfig.DisableTaggedMetrics { 1190 metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "rss"}, 1191 float32(ru.ResourceUsage.MemoryStats.RSS), tr.baseLabels) 1192 metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "cache"}, 1193 float32(ru.ResourceUsage.MemoryStats.Cache), tr.baseLabels) 1194 metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "swap"}, 1195 float32(ru.ResourceUsage.MemoryStats.Swap), tr.baseLabels) 1196 metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "usage"}, 1197 float32(ru.ResourceUsage.MemoryStats.Usage), tr.baseLabels) 1198 metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "max_usage"}, 1199 float32(ru.ResourceUsage.MemoryStats.MaxUsage), tr.baseLabels) 1200 metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "kernel_usage"}, 1201 float32(ru.ResourceUsage.MemoryStats.KernelUsage), tr.baseLabels) 1202 metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "kernel_max_usage"}, 1203 float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage), tr.baseLabels) 1204 if allocatedMem > 0 { 1205 metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "allocated"}, 1206 allocatedMem, tr.baseLabels) 1207 } 1208 } 1209 1210 if tr.clientConfig.BackwardsCompatibleMetrics { 1211 metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "rss"}, float32(ru.ResourceUsage.MemoryStats.RSS)) 1212 metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "cache"}, float32(ru.ResourceUsage.MemoryStats.Cache)) 1213 metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "swap"}, float32(ru.ResourceUsage.MemoryStats.Swap)) 1214 metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "usage"}, float32(ru.ResourceUsage.MemoryStats.Usage)) 1215 metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "max_usage"}, float32(ru.ResourceUsage.MemoryStats.MaxUsage)) 1216 metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "kernel_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelUsage)) 1217 metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "kernel_max_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage)) 1218 if allocatedMem > 0 { 1219 metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "allocated"}, allocatedMem) 1220 } 1221 } 1222 } 1223 1224 //TODO Remove Backwardscompat or use tr.Alloc()? 1225 func (tr *TaskRunner) setGaugeForCPU(ru *cstructs.TaskResourceUsage) { 1226 if !tr.clientConfig.DisableTaggedMetrics { 1227 metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "total_percent"}, 1228 float32(ru.ResourceUsage.CpuStats.Percent), tr.baseLabels) 1229 metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "system"}, 1230 float32(ru.ResourceUsage.CpuStats.SystemMode), tr.baseLabels) 1231 metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "user"}, 1232 float32(ru.ResourceUsage.CpuStats.UserMode), tr.baseLabels) 1233 metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "throttled_time"}, 1234 float32(ru.ResourceUsage.CpuStats.ThrottledTime), tr.baseLabels) 1235 metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "throttled_periods"}, 1236 float32(ru.ResourceUsage.CpuStats.ThrottledPeriods), tr.baseLabels) 1237 metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "total_ticks"}, 1238 float32(ru.ResourceUsage.CpuStats.TotalTicks), tr.baseLabels) 1239 } 1240 1241 if tr.clientConfig.BackwardsCompatibleMetrics { 1242 metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "total_percent"}, float32(ru.ResourceUsage.CpuStats.Percent)) 1243 metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "system"}, float32(ru.ResourceUsage.CpuStats.SystemMode)) 1244 metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "user"}, float32(ru.ResourceUsage.CpuStats.UserMode)) 1245 metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "throttled_time"}, float32(ru.ResourceUsage.CpuStats.ThrottledTime)) 1246 metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "throttled_periods"}, float32(ru.ResourceUsage.CpuStats.ThrottledPeriods)) 1247 metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "total_ticks"}, float32(ru.ResourceUsage.CpuStats.TotalTicks)) 1248 } 1249 } 1250 1251 // emitStats emits resource usage stats of tasks to remote metrics collector 1252 // sinks 1253 func (tr *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) { 1254 if !tr.clientConfig.PublishAllocationMetrics { 1255 return 1256 } 1257 1258 if ru.ResourceUsage.MemoryStats != nil { 1259 tr.setGaugeForMemory(ru) 1260 } 1261 1262 if ru.ResourceUsage.CpuStats != nil { 1263 tr.setGaugeForCPU(ru) 1264 } 1265 } 1266 1267 // appendTaskEvent updates the task status by appending the new event. 1268 func appendTaskEvent(state *structs.TaskState, event *structs.TaskEvent, capacity int) { 1269 if state.Events == nil { 1270 state.Events = make([]*structs.TaskEvent, 1, capacity) 1271 state.Events[0] = event 1272 return 1273 } 1274 1275 // If we hit capacity, then shift it. 1276 if len(state.Events) == capacity { 1277 old := state.Events 1278 state.Events = make([]*structs.TaskEvent, 0, capacity) 1279 state.Events = append(state.Events, old[1:]...) 1280 } 1281 1282 state.Events = append(state.Events, event) 1283 } 1284 1285 func (tr *TaskRunner) TaskExecHandler() drivermanager.TaskExecHandler { 1286 return tr.getDriverHandle().ExecStreaming 1287 } 1288 1289 func (tr *TaskRunner) DriverCapabilities() (*drivers.Capabilities, error) { 1290 return tr.driver.Capabilities() 1291 }