github.com/manicqin/nomad@v0.9.5/client/allocrunner/taskrunner/task_runner.go (about)
1  package taskrunner
2
3  import (
4  "context"
5  "errors"
6  "fmt"
7  "strings"
8  "sync"
9  "time"
10
11  metrics "github.com/armon/go-metrics"
12  log "github.com/hashicorp/go-hclog"
13  multierror "github.com/hashicorp/go-multierror"
14  "github.com/hashicorp/hcl2/hcldec"
15  "github.com/hashicorp/nomad/client/allocdir"
16  "github.com/hashicorp/nomad/client/allocrunner/interfaces"
17  "github.com/hashicorp/nomad/client/allocrunner/taskrunner/restarts"
18  "github.com/hashicorp/nomad/client/allocrunner/taskrunner/state"
19  "github.com/hashicorp/nomad/client/config"
20  "github.com/hashicorp/nomad/client/consul"
21  "github.com/hashicorp/nomad/client/devicemanager"
22  cinterfaces "github.com/hashicorp/nomad/client/interfaces"
23  "github.com/hashicorp/nomad/client/pluginmanager/drivermanager"
24  cstate "github.com/hashicorp/nomad/client/state"
25  cstructs "github.com/hashicorp/nomad/client/structs"
26  "github.com/hashicorp/nomad/client/taskenv"
27  "github.com/hashicorp/nomad/client/vaultclient"
28  "github.com/hashicorp/nomad/helper/pluginutils/hclspecutils"
29  "github.com/hashicorp/nomad/helper/pluginutils/hclutils"
30  "github.com/hashicorp/nomad/helper/uuid"
31  "github.com/hashicorp/nomad/nomad/structs"
32  bstructs "github.com/hashicorp/nomad/plugins/base/structs"
33  "github.com/hashicorp/nomad/plugins/drivers"
34  )
35
36  const (
37  // defaultMaxEvents is the default max capacity for task events on the
38  // task state. Overrideable for testing.
39  defaultMaxEvents = 10
40
41  // killBackoffBaseline is the baseline time for exponential backoff while
42  // killing a task.
43  killBackoffBaseline = 5 * time.Second
44
45  // killBackoffLimit is the limit of the exponential backoff for killing
46  // the task.
47  killBackoffLimit = 2 * time.Minute
48
49  // killFailureLimit is how many times we will attempt to kill a task before
50  // giving up and potentially leaking resources.
51  killFailureLimit = 5
52
53  // triggerUpdateChCap is the capacity for the triggerUpdateCh used for
54  // triggering updates. It should be exactly 1 as even if multiple
55  // updates have come in since the last one was handled, we only need to
56  // handle the last one.
57  triggerUpdateChCap = 1
58  )
59
60  type TaskRunner struct {
61  // allocID, taskName, taskLeader, and taskResources are immutable so these fields may
62  // be accessed without locks
63  allocID string
64  taskName string
65  taskLeader bool
66  taskResources *structs.AllocatedTaskResources
67
68  alloc *structs.Allocation
69  allocLock sync.Mutex
70
71  clientConfig *config.Config
72
73  // stateUpdater is used to emit updated task state
74  stateUpdater interfaces.TaskStateHandler
75
76  // state captures the state of the task for updating the allocation
77  // Must acquire stateLock to access.
78  state *structs.TaskState
79
80  // localState captures the node-local state of the task for when the
81  // Nomad agent restarts.
82  // Must acquire stateLock to access.
83  localState *state.LocalState
84
85  // stateLock must be acquired when accessing state or localState.
86  stateLock sync.RWMutex
87
88  // stateDB is for persisting localState and taskState
89  stateDB cstate.StateDB
90
91  // shutdownCtx is used to exit the TaskRunner *without* affecting task state.
92  shutdownCtx context.Context
93
94  // shutdownCtxCancel causes the TaskRunner to exit immediately without
95  // affecting task state. Useful for testing or graceful agent shutdown.
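//
// Note the two separate cancellation paths: shutdownCtx stops the TaskRunner
// itself (agent shutdown, tests) without touching task state, while killCtx
// (below) ends the task's lifecycle and is recorded as a kill. A minimal
// sketch of how NewTaskRunner wires them up (illustrative only):
//
//	shutdownCtx, shutdownCancel := context.WithCancel(context.Background())
//	killCtx, killCancel := context.WithCancel(context.Background())
//	// Shutdown() -> shutdownCancel(): Run returns, the task keeps running.
//	// killing    -> killCancel():     the task is stopped and marked killed.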
96  shutdownCtxCancel context.CancelFunc
97
98  // killCtx is the task runner's context representing the task's lifecycle.
99  // The context is canceled when the task is killed.
100  killCtx context.Context
101
102  // killCtxCancel is called when killing a task.
103  killCtxCancel context.CancelFunc
104
105  // killErr is populated when killing a task. Access should be done using the
106  // getter/setter
107  killErr error
108  killErrLock sync.Mutex
109
110  // Logger is the logger for the task runner.
111  logger log.Logger
112
113  // triggerUpdateCh is ticked whenever update hooks need to be run and
114  // must be created with cap=1 to signal a pending update and prevent
115  // callers from deadlocking if the receiver has exited.
116  triggerUpdateCh chan struct{}
117
118  // waitCh is closed when the task runner has transitioned to a terminal
119  // state
120  waitCh chan struct{}
121
122  // driver is the driver for the task.
123  driver drivers.DriverPlugin
124
125  // driverCapabilities is the set of capabilities the driver supports
126  driverCapabilities *drivers.Capabilities
127
128  // taskSchema is the hcl spec for the task driver configuration
129  taskSchema hcldec.Spec
130
131  // handleLock guards access to handle and handleResult
132  handleLock sync.Mutex
133
134  // handle to the running driver
135  handle *DriverHandle
136
137  // task is the task being run
138  task *structs.Task
139  taskLock sync.RWMutex
140
141  // taskDir is the directory structure for this task.
142  taskDir *allocdir.TaskDir
143
144  // envBuilder is used to build the task's environment
145  envBuilder *taskenv.Builder
146
147  // restartTracker is used to decide if the task should be restarted.
148  restartTracker *restarts.RestartTracker
149
150  // runnerHooks are task runner lifecycle hooks that should be run on state
151  // transitions.
152  runnerHooks []interfaces.TaskHook
153
154  // hookResources captures the resources provided by hooks
155  hookResources *hookResources
156
157  // consulClient is the client used by the consul service hook for
158  // registering services and checks
159  consulClient consul.ConsulServiceAPI
160
161  // vaultClient is the client to use to derive and renew Vault tokens
162  vaultClient vaultclient.VaultClient
163
164  // vaultToken is the current Vault token. It should be accessed with the
165  // getter.
166  vaultToken string
167  vaultTokenLock sync.Mutex
168
169  // baseLabels are used when emitting tagged metrics. All task runner metrics
170  // will have these tags, and optionally more.
171  baseLabels []metrics.Label
172
173  // logmonHookConfig is used to get the paths to the stdout and stderr fifos
174  // to be passed to the driver for task logging
175  logmonHookConfig *logmonHookConfig
176
177  // resourceUsage is written via UpdateStats and read via
178  // LatestResourceUsage. May be nil at any time.
179  resourceUsage *cstructs.TaskResourceUsage
180  resourceUsageLock sync.Mutex
181
182  // deviceStatsReporter is used to lookup resource usage for alloc devices
183  deviceStatsReporter cinterfaces.DeviceStatsReporter
184
185  // devicemanager is used to mount devices as well as lookup device
186  // statistics
187  devicemanager devicemanager.Manager
188
189  // driverManager is used to dispense driver plugins and register event
190  // handlers
191  driverManager drivermanager.Manager
192
193  // maxEvents is the capacity of the TaskEvents on the TaskState.
194  // Defaults to defaultMaxEvents but overrideable for testing.
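//
// When the event list is full, appendTaskEvent (at the bottom of this file)
// drops the oldest event before appending, so TaskState keeps a sliding
// window of the most recent maxEvents events. Rough sketch with the default
// capacity of 10:
//
//	// events e1..e10 recorded; appending e11 evicts e1
//	// state.Events == [e2, e3, ..., e11]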
195 maxEvents int 196 197 // serversContactedCh is passed to TaskRunners so they can detect when 198 // GetClientAllocs has been called in case of a failed restore. 199 serversContactedCh <-chan struct{} 200 201 // waitOnServers defaults to false but will be set true if a restore 202 // fails and the Run method should wait until serversContactedCh is 203 // closed. 204 waitOnServers bool 205 206 networkIsolationLock sync.Mutex 207 networkIsolationSpec *drivers.NetworkIsolationSpec 208 } 209 210 type Config struct { 211 Alloc *structs.Allocation 212 ClientConfig *config.Config 213 Consul consul.ConsulServiceAPI 214 Task *structs.Task 215 TaskDir *allocdir.TaskDir 216 Logger log.Logger 217 218 // Vault is the client to use to derive and renew Vault tokens 219 Vault vaultclient.VaultClient 220 221 // StateDB is used to store and restore state. 222 StateDB cstate.StateDB 223 224 // StateUpdater is used to emit updated task state 225 StateUpdater interfaces.TaskStateHandler 226 227 // deviceStatsReporter is used to lookup resource usage for alloc devices 228 DeviceStatsReporter cinterfaces.DeviceStatsReporter 229 230 // DeviceManager is used to mount devices as well as lookup device 231 // statistics 232 DeviceManager devicemanager.Manager 233 234 // DriverManager is used to dispense driver plugins and register event 235 // handlers 236 DriverManager drivermanager.Manager 237 238 // ServersContactedCh is closed when the first GetClientAllocs call to 239 // servers succeeds and allocs are synced. 240 ServersContactedCh chan struct{} 241 } 242 243 func NewTaskRunner(config *Config) (*TaskRunner, error) { 244 // Create a context for causing the runner to exit 245 trCtx, trCancel := context.WithCancel(context.Background()) 246 247 // Create a context for killing the runner 248 killCtx, killCancel := context.WithCancel(context.Background()) 249 250 // Initialize the environment builder 251 envBuilder := taskenv.NewBuilder( 252 config.ClientConfig.Node, 253 config.Alloc, 254 config.Task, 255 config.ClientConfig.Region, 256 ) 257 258 // Initialize state from alloc if it is set 259 tstate := structs.NewTaskState() 260 if ts := config.Alloc.TaskStates[config.Task.Name]; ts != nil { 261 tstate = ts.Copy() 262 } 263 264 tr := &TaskRunner{ 265 alloc: config.Alloc, 266 allocID: config.Alloc.ID, 267 clientConfig: config.ClientConfig, 268 task: config.Task, 269 taskDir: config.TaskDir, 270 taskName: config.Task.Name, 271 taskLeader: config.Task.Leader, 272 envBuilder: envBuilder, 273 consulClient: config.Consul, 274 vaultClient: config.Vault, 275 state: tstate, 276 localState: state.NewLocalState(), 277 stateDB: config.StateDB, 278 stateUpdater: config.StateUpdater, 279 deviceStatsReporter: config.DeviceStatsReporter, 280 killCtx: killCtx, 281 killCtxCancel: killCancel, 282 shutdownCtx: trCtx, 283 shutdownCtxCancel: trCancel, 284 triggerUpdateCh: make(chan struct{}, triggerUpdateChCap), 285 waitCh: make(chan struct{}), 286 devicemanager: config.DeviceManager, 287 driverManager: config.DriverManager, 288 maxEvents: defaultMaxEvents, 289 serversContactedCh: config.ServersContactedCh, 290 } 291 292 // Create the logger based on the allocation ID 293 tr.logger = config.Logger.Named("task_runner").With("task", config.Task.Name) 294 295 // Pull out the task's resources 296 ares := tr.alloc.AllocatedResources 297 if ares != nil { 298 tres, ok := ares.Tasks[tr.taskName] 299 if !ok { 300 return nil, fmt.Errorf("no task resources found on allocation") 301 } 302 tr.taskResources = tres 303 } else { 304 // COMPAT(0.11): 
Upgrade from 0.8 resources to 0.9+ resources 305 // Grab the old task resources 306 oldTr, ok := tr.alloc.TaskResources[tr.taskName] 307 if !ok { 308 return nil, fmt.Errorf("no task resources found on allocation") 309 } 310 311 // Convert the old to new 312 tr.taskResources = &structs.AllocatedTaskResources{ 313 Cpu: structs.AllocatedCpuResources{ 314 CpuShares: int64(oldTr.CPU), 315 }, 316 Memory: structs.AllocatedMemoryResources{ 317 MemoryMB: int64(oldTr.MemoryMB), 318 }, 319 Networks: oldTr.Networks, 320 } 321 } 322 323 // Build the restart tracker. 324 tg := tr.alloc.Job.LookupTaskGroup(tr.alloc.TaskGroup) 325 if tg == nil { 326 tr.logger.Error("alloc missing task group") 327 return nil, fmt.Errorf("alloc missing task group") 328 } 329 tr.restartTracker = restarts.NewRestartTracker(tg.RestartPolicy, tr.alloc.Job.Type) 330 331 // Get the driver 332 if err := tr.initDriver(); err != nil { 333 tr.logger.Error("failed to create driver", "error", err) 334 return nil, err 335 } 336 337 // Initialize the runners hooks. 338 tr.initHooks() 339 340 // Initialize base labels 341 tr.initLabels() 342 343 // Initialize initial task received event 344 tr.appendEvent(structs.NewTaskEvent(structs.TaskReceived)) 345 346 return tr, nil 347 } 348 349 func (tr *TaskRunner) initLabels() { 350 alloc := tr.Alloc() 351 tr.baseLabels = []metrics.Label{ 352 { 353 Name: "job_name", 354 Value: alloc.Job.Name, 355 }, 356 { 357 Name: "task_group", 358 Value: alloc.TaskGroup, 359 }, 360 { 361 Name: "alloc_id", 362 Value: tr.allocID, 363 }, 364 { 365 Name: "task", 366 Value: tr.taskName, 367 }, 368 { 369 Name: "namespace", 370 Value: tr.alloc.Namespace, 371 }, 372 } 373 374 if tr.alloc.Job.ParentID != "" { 375 tr.baseLabels = append(tr.baseLabels, metrics.Label{ 376 Name: "parent_id", 377 Value: tr.alloc.Job.ParentID, 378 }) 379 if strings.Contains(tr.alloc.Job.Name, "/dispatch-") { 380 tr.baseLabels = append(tr.baseLabels, metrics.Label{ 381 Name: "dispatch_id", 382 Value: strings.Split(tr.alloc.Job.Name, "/dispatch-")[1], 383 }) 384 } 385 if strings.Contains(tr.alloc.Job.Name, "/periodic-") { 386 tr.baseLabels = append(tr.baseLabels, metrics.Label{ 387 Name: "periodic_id", 388 Value: strings.Split(tr.alloc.Job.Name, "/periodic-")[1], 389 }) 390 } 391 } 392 } 393 394 // Mark a task as failed and not to run. Aimed to be invoked when alloc runner 395 // prestart hooks failed. 396 // Should never be called with Run(). 397 func (tr *TaskRunner) MarkFailedDead(reason string) { 398 defer close(tr.waitCh) 399 400 tr.stateLock.Lock() 401 if err := tr.stateDB.PutTaskRunnerLocalState(tr.allocID, tr.taskName, tr.localState); err != nil { 402 //TODO Nomad will be unable to restore this task; try to kill 403 // it now and fail? In general we prefer to leave running 404 // tasks running even if the agent encounters an error. 405 tr.logger.Warn("error persisting local failed task state; may be unable to restore after a Nomad restart", 406 "error", err) 407 } 408 tr.stateLock.Unlock() 409 410 event := structs.NewTaskEvent(structs.TaskSetupFailure). 411 SetDisplayMessage(reason). 412 SetFailsTask() 413 tr.UpdateState(structs.TaskStateDead, event) 414 415 // Run the stop hooks in case task was a restored task that failed prestart 416 if err := tr.stop(); err != nil { 417 tr.logger.Error("stop failed while marking task dead", "error", err) 418 } 419 } 420 421 // Run the TaskRunner. Starts the user's task or reattaches to a restored task. 422 // Run closes WaitCh when it exits. Should be started in a goroutine. 
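//
// A rough sketch of the loop below (pseudocode, not the exact control flow):
//
//	for the alloc is not terminal {
//		prestart hooks -> start driver -> poststart hooks
//		wait for the task to exit, be killed, or time out
//		exited hooks -> shouldRestart() -> sleep the restart delay or stop
//	}
//	kill any leftover handle, mark the task dead, run stop hooks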
423 func (tr *TaskRunner) Run() { 424 defer close(tr.waitCh) 425 var result *drivers.ExitResult 426 427 tr.stateLock.RLock() 428 dead := tr.state.State == structs.TaskStateDead 429 tr.stateLock.RUnlock() 430 431 // if restoring a dead task, ensure that task is cleared and all post hooks 432 // are called without additional state updates 433 if dead { 434 // do cleanup functions without emitting any additional events/work 435 // to handle cases where we restored a dead task where client terminated 436 // after task finished before completing post-run actions. 437 tr.clearDriverHandle() 438 tr.stateUpdater.TaskStateUpdated() 439 if err := tr.stop(); err != nil { 440 tr.logger.Error("stop failed on terminal task", "error", err) 441 } 442 return 443 } 444 445 // Updates are handled asynchronously with the other hooks but each 446 // triggered update - whether due to alloc updates or a new vault token 447 // - should be handled serially. 448 go tr.handleUpdates() 449 450 // If restore failed wait until servers are contacted before running. 451 // #1795 452 if tr.waitOnServers { 453 tr.logger.Info("task failed to restore; waiting to contact server before restarting") 454 select { 455 case <-tr.killCtx.Done(): 456 case <-tr.shutdownCtx.Done(): 457 return 458 case <-tr.serversContactedCh: 459 tr.logger.Info("server contacted; unblocking waiting task") 460 } 461 } 462 463 MAIN: 464 for !tr.Alloc().TerminalStatus() { 465 select { 466 case <-tr.killCtx.Done(): 467 break MAIN 468 case <-tr.shutdownCtx.Done(): 469 // TaskRunner was told to exit immediately 470 return 471 default: 472 } 473 474 // Run the prestart hooks 475 if err := tr.prestart(); err != nil { 476 tr.logger.Error("prestart failed", "error", err) 477 tr.restartTracker.SetStartError(err) 478 goto RESTART 479 } 480 481 select { 482 case <-tr.killCtx.Done(): 483 break MAIN 484 case <-tr.shutdownCtx.Done(): 485 // TaskRunner was told to exit immediately 486 return 487 default: 488 } 489 490 // Run the task 491 if err := tr.runDriver(); err != nil { 492 tr.logger.Error("running driver failed", "error", err) 493 tr.restartTracker.SetStartError(err) 494 goto RESTART 495 } 496 497 // Run the poststart hooks 498 if err := tr.poststart(); err != nil { 499 tr.logger.Error("poststart failed", "error", err) 500 } 501 502 // Grab the result proxy and wait for task to exit 503 WAIT: 504 { 505 handle := tr.getDriverHandle() 506 result = nil 507 508 // Do *not* use tr.killCtx here as it would cause 509 // Wait() to unblock before the task exits when Kill() 510 // is called. 
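// The block below waits on the driver's result channel and, when the task has
// a timeout configured, races it against a timer. A minimal sketch of the
// pattern (assuming Timeout is a number of seconds, as the multiplication by
// time.Second below implies):
//
//	timer := time.NewTimer(timeout * time.Second)
//	defer timer.Stop()
//	select {
//	case result = <-resultCh:  // task exited on its own
//	case <-timer.C:            // ran too long; kill and mark TimedOut
//	case <-killCtx.Done():     // explicit kill
//	case <-shutdownCtx.Done(): // agent shutting down; leave the task running
//	}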
511 if resultCh, err := handle.WaitCh(context.Background()); err != nil { 512 tr.logger.Error("wait task failed", "error", err) 513 // Set a timer if timeout was specified, and add a new case if the timer elapsed 514 } else if tr.task.Timeout > 0 { 515 timer := time.NewTimer(tr.task.Timeout * time.Second) 516 select { 517 case <-tr.killCtx.Done(): 518 // We can go through the normal should restart check since 519 // the restart tracker knowns it is killed 520 result = tr.handleKill() 521 case <-tr.shutdownCtx.Done(): 522 // TaskRunner was told to exit immediately 523 return 524 case <-timer.C: 525 result = tr.handleTimeout() 526 case result = <-resultCh: 527 } 528 529 // WaitCh returned a result 530 if retryWait := tr.handleTaskExitResult(result); retryWait { 531 goto WAIT 532 } 533 } else { 534 select { 535 case <-tr.killCtx.Done(): 536 // We can go through the normal should restart check since 537 // the restart tracker knowns it is killed 538 result = tr.handleKill() 539 case <-tr.shutdownCtx.Done(): 540 // TaskRunner was told to exit immediately 541 return 542 case result = <-resultCh: 543 } 544 545 // WaitCh returned a result 546 if retryWait := tr.handleTaskExitResult(result); retryWait { 547 goto WAIT 548 } 549 } 550 } 551 552 // Clear the handle 553 tr.clearDriverHandle() 554 555 // Store the wait result on the restart tracker 556 tr.restartTracker.SetExitResult(result) 557 558 if err := tr.exited(); err != nil { 559 tr.logger.Error("exited hooks failed", "error", err) 560 } 561 562 RESTART: 563 restart, restartDelay := tr.shouldRestart() 564 if !restart { 565 break MAIN 566 } 567 568 // Actually restart by sleeping and also watching for destroy events 569 select { 570 case <-time.After(restartDelay): 571 case <-tr.killCtx.Done(): 572 tr.logger.Trace("task killed between restarts", "delay", restartDelay) 573 break MAIN 574 case <-tr.shutdownCtx.Done(): 575 // TaskRunner was told to exit immediately 576 tr.logger.Trace("gracefully shutting down during restart delay") 577 return 578 } 579 } 580 581 // Ensure handle is cleaned up. Restore could have recovered a task 582 // that should be terminal, so if the handle still exists we should 583 // kill it here. 584 if tr.getDriverHandle() != nil { 585 if result = tr.handleKill(); result != nil { 586 tr.emitExitResultEvent(result) 587 } 588 589 tr.clearDriverHandle() 590 591 if err := tr.exited(); err != nil { 592 tr.logger.Error("exited hooks failed while cleaning up terminal task", "error", err) 593 } 594 } 595 596 // Mark the task as dead 597 tr.UpdateState(structs.TaskStateDead, nil) 598 599 // Run the stop hooks 600 if err := tr.stop(); err != nil { 601 tr.logger.Error("stop failed", "error", err) 602 } 603 604 tr.logger.Debug("task run loop exiting") 605 } 606 607 // handleTaskExitResult handles the results returned by the task exiting. If 608 // retryWait is true, the caller should attempt to wait on the task again since 609 // it has not actually finished running. This can happen if the driver plugin 610 // has exited. 
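//
// In Run above this is the WAIT/goto loop: if the driver plugin itself died
// (bstructs.ErrPluginShutdown), the handle is re-established and the task is
// waited on again instead of being treated as exited. Roughly:
//
//	for {
//		result := <-resultCh
//		if !tr.handleTaskExitResult(result) { // retryWait == false
//			break
//		}
//		// plugin recovered; re-acquire the handle and wait again
//	}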
611 func (tr *TaskRunner) handleTaskExitResult(result *drivers.ExitResult) (retryWait bool) { 612 if result == nil { 613 return false 614 } 615 616 if result.Err == bstructs.ErrPluginShutdown { 617 dn := tr.Task().Driver 618 tr.logger.Debug("driver plugin has shutdown; attempting to recover task", "driver", dn) 619 620 // Initialize a new driver handle 621 if err := tr.initDriver(); err != nil { 622 tr.logger.Error("failed to initialize driver after it exited unexpectedly", "error", err, "driver", dn) 623 return false 624 } 625 626 // Try to restore the handle 627 tr.stateLock.RLock() 628 h := tr.localState.TaskHandle 629 net := tr.localState.DriverNetwork 630 tr.stateLock.RUnlock() 631 if !tr.restoreHandle(h, net) { 632 tr.logger.Error("failed to restore handle on driver after it exited unexpectedly", "driver", dn) 633 return false 634 } 635 636 tr.logger.Debug("task successfully recovered on driver", "driver", dn) 637 return true 638 } 639 640 // Emit Terminated event 641 tr.emitExitResultEvent(result) 642 643 return false 644 } 645 646 // emitExitResultEvent emits a TaskTerminated event for an ExitResult. 647 func (tr *TaskRunner) emitExitResultEvent(result *drivers.ExitResult) { 648 event := structs.NewTaskEvent(structs.TaskTerminated). 649 SetExitCode(result.ExitCode). 650 SetSignal(result.Signal). 651 SetOOMKilled(result.OOMKilled). 652 SetExitMessage(result.Err). 653 SetTimeout(result.TimedOut) 654 655 tr.EmitEvent(event) 656 657 if result.OOMKilled && !tr.clientConfig.DisableTaggedMetrics { 658 metrics.IncrCounterWithLabels([]string{"client", "allocs", "oom_killed"}, 1, tr.baseLabels) 659 } 660 } 661 662 // handleUpdates runs update hooks when triggerUpdateCh is ticked and exits 663 // when Run has returned. Should only be run in a goroutine from Run. 664 func (tr *TaskRunner) handleUpdates() { 665 for { 666 select { 667 case <-tr.triggerUpdateCh: 668 case <-tr.waitCh: 669 return 670 } 671 672 // Non-terminal update; run hooks 673 tr.updateHooks() 674 } 675 } 676 677 // shouldRestart determines whether the task should be restarted and updates 678 // the task state unless the task is killed or terminated. 679 func (tr *TaskRunner) shouldRestart() (bool, time.Duration) { 680 // Determine if we should restart 681 state, when := tr.restartTracker.GetState() 682 reason := tr.restartTracker.GetReason() 683 switch state { 684 case structs.TaskKilled: 685 // Never restart an explicitly killed task. Kill method handles 686 // updating the server. 
687 tr.EmitEvent(structs.NewTaskEvent(state)) 688 return false, 0 689 case structs.TaskNotRestarting, structs.TaskTerminated: 690 tr.logger.Info("not restarting task", "reason", reason) 691 if state == structs.TaskNotRestarting { 692 tr.UpdateState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskNotRestarting).SetRestartReason(reason).SetFailsTask()) 693 } 694 return false, 0 695 case structs.TaskRestarting: 696 tr.logger.Info("restarting task", "reason", reason, "delay", when) 697 tr.UpdateState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskRestarting).SetRestartDelay(when).SetRestartReason(reason)) 698 return true, when 699 default: 700 tr.logger.Error("restart tracker returned unknown state", "state", state) 701 return true, when 702 } 703 } 704 705 // runDriver runs the driver and waits for it to exit 706 // runDriver emits an appropriate task event on success/failure 707 func (tr *TaskRunner) runDriver() error { 708 709 taskConfig := tr.buildTaskConfig() 710 711 // Build hcl context variables 712 vars, errs, err := tr.envBuilder.Build().AllValues() 713 if err != nil { 714 return fmt.Errorf("error building environment variables: %v", err) 715 } 716 717 // Handle per-key errors 718 if len(errs) > 0 { 719 keys := make([]string, 0, len(errs)) 720 for k, err := range errs { 721 keys = append(keys, k) 722 723 if tr.logger.IsTrace() { 724 // Verbosely log every diagnostic for debugging 725 tr.logger.Trace("error building environment variables", "key", k, "error", err) 726 } 727 } 728 729 tr.logger.Warn("some environment variables not available for rendering", "keys", strings.Join(keys, ", ")) 730 } 731 732 val, diag, diagErrs := hclutils.ParseHclInterface(tr.task.Config, tr.taskSchema, vars) 733 if diag.HasErrors() { 734 parseErr := multierror.Append(errors.New("failed to parse config: "), diagErrs...) 735 tr.EmitEvent(structs.NewTaskEvent(structs.TaskFailedValidation).SetValidationError(parseErr)) 736 return parseErr 737 } 738 739 if err := taskConfig.EncodeDriverConfig(val); err != nil { 740 encodeErr := fmt.Errorf("failed to encode driver config: %v", err) 741 tr.EmitEvent(structs.NewTaskEvent(structs.TaskFailedValidation).SetValidationError(encodeErr)) 742 return encodeErr 743 } 744 745 // If there's already a task handle (eg from a Restore) there's nothing 746 // to do except update state. 747 if tr.getDriverHandle() != nil { 748 // Ensure running state is persisted but do *not* append a new 749 // task event as restoring is a client event and not relevant 750 // to a task's lifecycle. 751 if err := tr.updateStateImpl(structs.TaskStateRunning); err != nil { 752 //TODO return error and destroy task to avoid an orphaned task? 
753  tr.logger.Warn("error persisting task state", "error", err)
754  }
755  return nil
756  }
757
758  // Start the job if there's no existing handle (or if RecoverTask failed)
759  handle, net, err := tr.driver.StartTask(taskConfig)
760  if err != nil {
761  // The plugin has died, try relaunching it
762  if err == bstructs.ErrPluginShutdown {
763  tr.logger.Info("failed to start task because plugin shutdown unexpectedly; attempting to recover")
764  if err := tr.initDriver(); err != nil {
765  taskErr := fmt.Errorf("failed to initialize driver after it exited unexpectedly: %v", err)
766  tr.EmitEvent(structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(taskErr))
767  return taskErr
768  }
769
770  handle, net, err = tr.driver.StartTask(taskConfig)
771  if err != nil {
772  taskErr := fmt.Errorf("failed to start task after driver exited unexpectedly: %v", err)
773  tr.EmitEvent(structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(taskErr))
774  return taskErr
775  }
776  } else {
777  // Do *NOT* wrap the error here without maintaining whether or not it is Recoverable.
778  // You must emit a task event failure to be considered Recoverable
779  tr.EmitEvent(structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(err))
780  return err
781  }
782  }
783
784  tr.stateLock.Lock()
785  tr.localState.TaskHandle = handle
786  tr.localState.DriverNetwork = net
787  if err := tr.stateDB.PutTaskRunnerLocalState(tr.allocID, tr.taskName, tr.localState); err != nil {
788  //TODO Nomad will be unable to restore this task; try to kill
789  // it now and fail? In general we prefer to leave running
790  // tasks running even if the agent encounters an error.
791  tr.logger.Warn("error persisting local task state; may be unable to restore after a Nomad restart",
792  "error", err, "task_id", handle.Config.ID)
793  }
794  tr.stateLock.Unlock()
795
796  tr.setDriverHandle(NewDriverHandle(tr.driver, taskConfig.ID, tr.Task(), net))
797
798  // Emit an event that we started
799  tr.UpdateState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted))
800  return nil
801  }
802
803  // initDriver retrieves the DriverPlugin from the plugin loader for this task
804  func (tr *TaskRunner) initDriver() error {
805  driver, err := tr.driverManager.Dispense(tr.Task().Driver)
806  if err != nil {
807  return err
808  }
809  tr.driver = driver
810
811  schema, err := tr.driver.TaskConfigSchema()
812  if err != nil {
813  return err
814  }
815  spec, diag := hclspecutils.Convert(schema)
816  if diag.HasErrors() {
817  return multierror.Append(errors.New("failed to convert task schema"), diag.Errs()...)
818  }
819  tr.taskSchema = spec
820
821  caps, err := tr.driver.Capabilities()
822  if err != nil {
823  return err
824  }
825  tr.driverCapabilities = caps
826
827  return nil
828  }
829
830  func (tr *TaskRunner) handleTimeout() *drivers.ExitResult {
831  event := structs.NewTaskEvent(structs.TaskKilling).
832  SetKillReason("Timeout")
833
834  tr.EmitEvent(event)
835  result := tr.handleKill()
// handleKill can return nil (for example if the handle is already gone or the
// runner is shutting down); guard before mutating the result.
if result == nil {
result = &drivers.ExitResult{}
}
836  result.ExitCode = 1
837  result.TimedOut = true
838
839  return result
840  }
841
842  // handleKill is used to handle a request to kill a task. It will return
843  // the handle exit result if one is available and store any error in the task
844  // runner killErr value.
845  func (tr *TaskRunner) handleKill() *drivers.ExitResult {
846  // Run the pre killing hooks
847  tr.preKill()
848
849  // Tell the restart tracker that the task has been killed so it doesn't
850  // attempt to restart it.
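// The overall kill sequence is: run preKill hooks, mark the restart tracker
// killed, destroy the task via killTask (which retries with exponential
// backoff), then block until the driver reports the task gone. With the
// constants at the top of this file the retry schedule works out to roughly:
//
//	attempt 1 fails -> sleep 5s
//	attempt 2 fails -> sleep 20s
//	attempt 3 fails -> sleep 80s
//	attempts 4-5    -> sleep 120s (capped at killBackoffLimit)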
851  tr.restartTracker.SetKilled()
852
853  // Check it is running
854  handle := tr.getDriverHandle()
855  if handle == nil {
856  return nil
857  }
858
859  // Kill the task using an exponential backoff in case of failures.
860  killErr := tr.killTask(handle)
861  if killErr != nil {
862  // We couldn't successfully destroy the resource created.
863  tr.logger.Error("failed to kill task. Resources may have been leaked", "error", killErr)
864  tr.setKillErr(killErr)
865  }
866
867  // Block until task has exited.
868  waitCh, err := handle.WaitCh(tr.shutdownCtx)
869
870  // The error should be nil or TaskNotFound; if it's something else, then a
871  // failure in the driver or transport layer occurred
872  if err != nil {
873  if err == drivers.ErrTaskNotFound {
874  return nil
875  }
876  tr.logger.Error("failed to wait on task. Resources may have been leaked", "error", err)
877  tr.setKillErr(killErr)
878  return nil
879  }
880
881  select {
882  case result := <-waitCh:
883  return result
884  case <-tr.shutdownCtx.Done():
885  return nil
886  }
887  }
888
889  // killTask kills the task handle. In the case that killing fails,
890  // killTask will retry with an exponential backoff and will give up at a
891  // given limit. Returns an error if the task could not be killed.
892  func (tr *TaskRunner) killTask(handle *DriverHandle) error {
893  // Cap the number of times we attempt to kill the task.
894  var err error
895  for i := 0; i < killFailureLimit; i++ {
896  if err = handle.Kill(); err != nil {
897  if err == drivers.ErrTaskNotFound {
898  tr.logger.Warn("couldn't find task to kill", "task_id", handle.ID())
899  return nil
900  }
901  // Calculate the new backoff
902  backoff := (1 << (2 * uint64(i))) * killBackoffBaseline
903  if backoff > killBackoffLimit {
904  backoff = killBackoffLimit
905  }
906
907  tr.logger.Error("failed to kill task", "backoff", backoff, "error", err)
908  time.Sleep(backoff)
909  } else {
910  // Kill was successful
911  return nil
912  }
913  }
914  return err
915  }
916
917  // persistLocalState persists local state to disk synchronously.
918  func (tr *TaskRunner) persistLocalState() error {
919  tr.stateLock.RLock()
920  defer tr.stateLock.RUnlock()
921
922  return tr.stateDB.PutTaskRunnerLocalState(tr.allocID, tr.taskName, tr.localState)
923  }
924
925  // buildTaskConfig builds a drivers.TaskConfig with a unique ID for the task.
926  // The ID is unique for every invocation; it is built from the alloc ID, task
927  // name and 8 random characters.
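//
// For example (hypothetical values), a task named "redis" would get an ID
// like:
//
//	"b6a2fd6f-6dad-43c4-8a43-3c71a6a183a1/redis/6f3a9b21"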
928 func (tr *TaskRunner) buildTaskConfig() *drivers.TaskConfig { 929 task := tr.Task() 930 alloc := tr.Alloc() 931 invocationid := uuid.Generate()[:8] 932 taskResources := tr.taskResources 933 env := tr.envBuilder.Build() 934 tr.networkIsolationLock.Lock() 935 defer tr.networkIsolationLock.Unlock() 936 937 return &drivers.TaskConfig{ 938 ID: fmt.Sprintf("%s/%s/%s", alloc.ID, task.Name, invocationid), 939 Name: task.Name, 940 JobName: alloc.Job.Name, 941 TaskGroupName: alloc.TaskGroup, 942 Resources: &drivers.Resources{ 943 NomadResources: taskResources, 944 LinuxResources: &drivers.LinuxResources{ 945 MemoryLimitBytes: taskResources.Memory.MemoryMB * 1024 * 1024, 946 CPUShares: taskResources.Cpu.CpuShares, 947 PercentTicks: float64(taskResources.Cpu.CpuShares) / float64(tr.clientConfig.Node.NodeResources.Cpu.CpuShares), 948 }, 949 }, 950 Devices: tr.hookResources.getDevices(), 951 Mounts: tr.hookResources.getMounts(), 952 Env: env.Map(), 953 DeviceEnv: env.DeviceEnv(), 954 User: task.User, 955 AllocDir: tr.taskDir.AllocDir, 956 StdoutPath: tr.logmonHookConfig.stdoutFifo, 957 StderrPath: tr.logmonHookConfig.stderrFifo, 958 AllocID: tr.allocID, 959 NetworkIsolation: tr.networkIsolationSpec, 960 } 961 } 962 963 // Restore task runner state. Called by AllocRunner.Restore after NewTaskRunner 964 // but before Run so no locks need to be acquired. 965 func (tr *TaskRunner) Restore() error { 966 ls, ts, err := tr.stateDB.GetTaskRunnerState(tr.allocID, tr.taskName) 967 if err != nil { 968 return err 969 } 970 971 if ls != nil { 972 ls.Canonicalize() 973 tr.localState = ls 974 } 975 976 if ts != nil { 977 ts.Canonicalize() 978 tr.state = ts 979 } 980 981 // If a TaskHandle was persisted, ensure it is valid or destroy it. 982 if taskHandle := tr.localState.TaskHandle; taskHandle != nil { 983 //TODO if RecoverTask returned the DriverNetwork we wouldn't 984 // have to persist it at all! 985 restored := tr.restoreHandle(taskHandle, tr.localState.DriverNetwork) 986 987 // If the handle could not be restored, the alloc is 988 // non-terminal, and the task isn't a system job: wait until 989 // servers have been contacted before running. #1795 990 if restored { 991 return nil 992 } 993 994 alloc := tr.Alloc() 995 if tr.state.State == structs.TaskStateDead || alloc.TerminalStatus() || alloc.Job.Type == structs.JobTypeSystem { 996 return nil 997 } 998 999 tr.logger.Trace("failed to reattach to task; will not run until server is contacted") 1000 tr.waitOnServers = true 1001 1002 ev := structs.NewTaskEvent(structs.TaskRestoreFailed). 1003 SetDisplayMessage("failed to restore task; will not run until server is contacted") 1004 tr.UpdateState(structs.TaskStatePending, ev) 1005 } 1006 1007 return nil 1008 } 1009 1010 // restoreHandle ensures a TaskHandle is valid by calling Driver.RecoverTask 1011 // and sets the driver handle. If the TaskHandle is not valid, DestroyTask is 1012 // called. 
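//
// Outcomes below, roughly:
//
//	RecoverTask succeeds                    -> handle set, returns true
//	RecoverTask fails, task was not running -> nothing to recover, true
//	RecoverTask fails, task was running     -> DestroyTask to clean up
//	                                           plugin state; true if the
//	                                           cleanup succeeded, false if
//	                                           it failed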
1013  func (tr *TaskRunner) restoreHandle(taskHandle *drivers.TaskHandle, net *drivers.DriverNetwork) (success bool) {
1014  // Ensure handle is well-formed
1015  if taskHandle.Config == nil {
1016  return true
1017  }
1018
1019  if err := tr.driver.RecoverTask(taskHandle); err != nil {
1020  if tr.TaskState().State != structs.TaskStateRunning {
1021  // RecoverTask should fail if the Task wasn't running
1022  return true
1023  }
1024
1025  tr.logger.Error("error recovering task; cleaning up",
1026  "error", err, "task_id", taskHandle.Config.ID)
1027
1028  // Try to cleanup any existing task state in the plugin before restarting
1029  if err := tr.driver.DestroyTask(taskHandle.Config.ID, true); err != nil {
1030  // Ignore ErrTaskNotFound errors as ideally
1031  // this task has already been stopped and
1032  // therefore doesn't exist.
1033  if err != drivers.ErrTaskNotFound {
1034  tr.logger.Warn("error destroying unrecoverable task",
1035  "error", err, "task_id", taskHandle.Config.ID)
1036  }
1037
1038  return false
1039  }
1040
1041  return true
1042  }
1043
1044  // Update driver handle on task runner
1045  tr.setDriverHandle(NewDriverHandle(tr.driver, taskHandle.Config.ID, tr.Task(), net))
1046  return true
1047  }
1048
1049  // UpdateState sets the task runner's allocation state and triggers a server
1050  // update.
1051  func (tr *TaskRunner) UpdateState(state string, event *structs.TaskEvent) {
1052  tr.stateLock.Lock()
1053  defer tr.stateLock.Unlock()
1054
1055  if event != nil {
1056  tr.logger.Trace("setting task state", "state", state, "event", event.Type)
1057
1058  // Append the event
1059  tr.appendEvent(event)
1060  }
1061
1062  // Update the state
1063  if err := tr.updateStateImpl(state); err != nil {
1064  // Only log the error, as persistence errors should not
1065  // affect task state.
1066  tr.logger.Error("error persisting task state", "error", err, "event", event, "state", state)
1067  }
1068
1069  // Notify the alloc runner of the transition
1070  tr.stateUpdater.TaskStateUpdated()
1071  }
1072
1073  // updateStateImpl updates the in-memory task state and persists to disk.
1074  func (tr *TaskRunner) updateStateImpl(state string) error {
1075
1076  // Update the task state
1077  oldState := tr.state.State
1078  taskState := tr.state
1079  taskState.State = state
1080
1081  // Handle the state transition.
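// Transitions handled below: entering "running" stamps StartedAt and bumps
// the client.allocs.running counter; entering "dead" stamps FinishedAt and
// bumps client.allocs.complete or client.allocs.failed depending on whether
// the task failed (tagged metrics only, unless disabled).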
1082  switch state {
1083  case structs.TaskStateRunning:
1084  // Capture the start time if it is just starting
1085  if oldState != structs.TaskStateRunning {
1086  taskState.StartedAt = time.Now().UTC()
1087  if !tr.clientConfig.DisableTaggedMetrics {
1088  metrics.IncrCounterWithLabels([]string{"client", "allocs", "running"}, 1, tr.baseLabels)
1089  }
1090  //if r.config.BackwardsCompatibleMetrics {
1091  //metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "running"}, 1)
1092  //}
1093  }
1094  case structs.TaskStateDead:
1095  // Capture the finished time if not already set
1096  if taskState.FinishedAt.IsZero() {
1097  taskState.FinishedAt = time.Now().UTC()
1098  }
1099
1100  // Emit metrics to indicate task completion or failure
1101  if taskState.Failed {
1102  if !tr.clientConfig.DisableTaggedMetrics {
1103  metrics.IncrCounterWithLabels([]string{"client", "allocs", "failed"}, 1, tr.baseLabels)
1104  }
1105  //if r.config.BackwardsCompatibleMetrics {
1106  //metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "failed"}, 1)
1107  //}
1108  } else {
1109  if !tr.clientConfig.DisableTaggedMetrics {
1110  metrics.IncrCounterWithLabels([]string{"client", "allocs", "complete"}, 1, tr.baseLabels)
1111  }
1112  //if r.config.BackwardsCompatibleMetrics {
1113  //metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "complete"}, 1)
1114  //}
1115  }
1116  }
1117
1118  // Persist the state and event
1119  return tr.stateDB.PutTaskState(tr.allocID, tr.taskName, taskState)
1120  }
1121
1122  // EmitEvent appends a new TaskEvent to this task's TaskState. The actual
1123  // TaskState.State (pending, running, dead) is not changed. Use UpdateState to
1124  // transition states.
1125  // Events are persisted locally and sent to the server, but errors are simply
1126  // logged. Use AppendEvent to simply add a new event.
1127  func (tr *TaskRunner) EmitEvent(event *structs.TaskEvent) {
1128  tr.stateLock.Lock()
1129  defer tr.stateLock.Unlock()
1130
1131  tr.appendEvent(event)
1132
1133  if err := tr.stateDB.PutTaskState(tr.allocID, tr.taskName, tr.state); err != nil {
1134  // Only a warning because the next event/state-transition will
1135  // try to persist it again.
1136  tr.logger.Warn("error persisting event", "error", err, "event", event)
1137  }
1138
1139  // Notify the alloc runner of the event
1140  tr.stateUpdater.TaskStateUpdated()
1141  }
1142
1143  // AppendEvent appends a new TaskEvent to this task's TaskState. The actual
1144  // TaskState.State (pending, running, dead) is not changed. Use UpdateState to
1145  // transition states.
1146  // Events are persisted locally and errors are simply logged. Use EmitEvent to
1147  // also update AllocRunner.
1148  func (tr *TaskRunner) AppendEvent(event *structs.TaskEvent) {
1149  tr.stateLock.Lock()
1150  defer tr.stateLock.Unlock()
1151
1152  tr.appendEvent(event)
1153
1154  if err := tr.stateDB.PutTaskState(tr.allocID, tr.taskName, tr.state); err != nil {
1155  // Only a warning because the next event/state-transition will
1156  // try to persist it again.
1157  tr.logger.Warn("error persisting event", "error", err, "event", event)
1158  }
1159  }
1160
1161  // appendEvent to task's event slice. Caller must acquire stateLock.
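//
// Relationship between the three event entry points, as a rough guide:
//
//	tr.AppendEvent(ev) // record + persist only
//	tr.EmitEvent(ev)   // record + persist + notify the alloc runner
//	tr.UpdateState(structs.TaskStateRunning, ev) // all of the above, plus
//	                                             // a TaskState transition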
1162 func (tr *TaskRunner) appendEvent(event *structs.TaskEvent) error { 1163 // Ensure the event is populated with human readable strings 1164 event.PopulateEventDisplayMessage() 1165 1166 // Propagate failure from event to task state 1167 if event.FailsTask { 1168 tr.state.Failed = true 1169 } 1170 1171 // XXX This seems like a super awkward spot for this? Why not shouldRestart? 1172 // Update restart metrics 1173 if event.Type == structs.TaskRestarting { 1174 if !tr.clientConfig.DisableTaggedMetrics { 1175 metrics.IncrCounterWithLabels([]string{"client", "allocs", "restart"}, 1, tr.baseLabels) 1176 } 1177 //if r.config.BackwardsCompatibleMetrics { 1178 //metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "restart"}, 1) 1179 //} 1180 tr.state.Restarts++ 1181 tr.state.LastRestart = time.Unix(0, event.Time) 1182 } 1183 1184 // Append event to slice 1185 appendTaskEvent(tr.state, event, tr.maxEvents) 1186 1187 return nil 1188 } 1189 1190 // WaitCh is closed when TaskRunner.Run exits. 1191 func (tr *TaskRunner) WaitCh() <-chan struct{} { 1192 return tr.waitCh 1193 } 1194 1195 // Update the running allocation with a new version received from the server. 1196 // Calls Update hooks asynchronously with Run. 1197 // 1198 // This method is safe for calling concurrently with Run and does not modify 1199 // the passed in allocation. 1200 func (tr *TaskRunner) Update(update *structs.Allocation) { 1201 task := update.LookupTask(tr.taskName) 1202 if task == nil { 1203 // This should not happen and likely indicates a bug in the 1204 // server or client. 1205 tr.logger.Error("allocation update is missing task; killing", 1206 "group", update.TaskGroup) 1207 te := structs.NewTaskEvent(structs.TaskKilled). 1208 SetKillReason("update missing task"). 1209 SetFailsTask() 1210 tr.Kill(context.Background(), te) 1211 return 1212 } 1213 1214 // Update tr.alloc 1215 tr.setAlloc(update, task) 1216 1217 // Trigger update hooks if not terminal 1218 if !update.TerminalStatus() { 1219 tr.triggerUpdateHooks() 1220 } 1221 } 1222 1223 // SetNetworkIsolation is called by the PreRun allocation hook after configuring 1224 // the network isolation for the allocation 1225 func (tr *TaskRunner) SetNetworkIsolation(n *drivers.NetworkIsolationSpec) { 1226 tr.networkIsolationLock.Lock() 1227 tr.networkIsolationSpec = n 1228 tr.networkIsolationLock.Unlock() 1229 } 1230 1231 // triggerUpdate if there isn't already an update pending. Should be called 1232 // instead of calling updateHooks directly to serialize runs of update hooks. 1233 // TaskRunner state should be updated prior to triggering update hooks. 1234 // 1235 // Does not block. 1236 func (tr *TaskRunner) triggerUpdateHooks() { 1237 select { 1238 case tr.triggerUpdateCh <- struct{}{}: 1239 default: 1240 // already an update hook pending 1241 } 1242 } 1243 1244 // Shutdown TaskRunner gracefully without affecting the state of the task. 1245 // Shutdown blocks until the main Run loop exits. 1246 func (tr *TaskRunner) Shutdown() { 1247 tr.logger.Trace("shutting down") 1248 tr.shutdownCtxCancel() 1249 1250 <-tr.WaitCh() 1251 1252 // Run shutdown hooks to cleanup 1253 tr.shutdownHooks() 1254 1255 // Persist once more 1256 tr.persistLocalState() 1257 } 1258 1259 // LatestResourceUsage returns the last resource utilization datapoint 1260 // collected. May return nil if the task is not running or no resource 1261 // utilization has been collected yet. 
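//
// Stats flow, roughly: UpdateStats (below) caches the most recent sample and
// emits gauges when publishing is enabled; LatestResourceUsage returns that
// cached sample, attaching device stats lazily at read time.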
1262 func (tr *TaskRunner) LatestResourceUsage() *cstructs.TaskResourceUsage { 1263 tr.resourceUsageLock.Lock() 1264 ru := tr.resourceUsage 1265 tr.resourceUsageLock.Unlock() 1266 1267 // Look up device statistics lazily when fetched, as currently we do not emit any stats for them yet 1268 if ru != nil && tr.deviceStatsReporter != nil { 1269 deviceResources := tr.taskResources.Devices 1270 ru.ResourceUsage.DeviceStats = tr.deviceStatsReporter.LatestDeviceResourceStats(deviceResources) 1271 } 1272 return ru 1273 } 1274 1275 // UpdateStats updates and emits the latest stats from the driver. 1276 func (tr *TaskRunner) UpdateStats(ru *cstructs.TaskResourceUsage) { 1277 tr.resourceUsageLock.Lock() 1278 tr.resourceUsage = ru 1279 tr.resourceUsageLock.Unlock() 1280 if ru != nil { 1281 tr.emitStats(ru) 1282 } 1283 } 1284 1285 //TODO Remove Backwardscompat or use tr.Alloc()? 1286 func (tr *TaskRunner) setGaugeForMemory(ru *cstructs.TaskResourceUsage) { 1287 alloc := tr.Alloc() 1288 var allocatedMem float32 1289 if alloc.AllocatedResources != nil { 1290 if taskRes := alloc.AllocatedResources.Tasks[tr.taskName]; taskRes != nil { 1291 // Convert to bytes to match other memory metrics 1292 allocatedMem = float32(taskRes.Memory.MemoryMB) * 1024 * 1024 1293 } 1294 } else if taskRes := alloc.TaskResources[tr.taskName]; taskRes != nil { 1295 // COMPAT(0.11) Remove in 0.11 when TaskResources is removed 1296 allocatedMem = float32(taskRes.MemoryMB) * 1024 * 1024 1297 1298 } 1299 1300 if !tr.clientConfig.DisableTaggedMetrics { 1301 metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "rss"}, 1302 float32(ru.ResourceUsage.MemoryStats.RSS), tr.baseLabels) 1303 metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "cache"}, 1304 float32(ru.ResourceUsage.MemoryStats.Cache), tr.baseLabels) 1305 metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "swap"}, 1306 float32(ru.ResourceUsage.MemoryStats.Swap), tr.baseLabels) 1307 metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "usage"}, 1308 float32(ru.ResourceUsage.MemoryStats.Usage), tr.baseLabels) 1309 metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "max_usage"}, 1310 float32(ru.ResourceUsage.MemoryStats.MaxUsage), tr.baseLabels) 1311 metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "kernel_usage"}, 1312 float32(ru.ResourceUsage.MemoryStats.KernelUsage), tr.baseLabels) 1313 metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "kernel_max_usage"}, 1314 float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage), tr.baseLabels) 1315 if allocatedMem > 0 { 1316 metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "allocated"}, 1317 allocatedMem, tr.baseLabels) 1318 } 1319 } 1320 1321 if tr.clientConfig.BackwardsCompatibleMetrics { 1322 metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "rss"}, float32(ru.ResourceUsage.MemoryStats.RSS)) 1323 metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "cache"}, float32(ru.ResourceUsage.MemoryStats.Cache)) 1324 metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "swap"}, float32(ru.ResourceUsage.MemoryStats.Swap)) 1325 metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "usage"}, float32(ru.ResourceUsage.MemoryStats.Usage)) 1326 metrics.SetGauge([]string{"client", "allocs", 
alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "max_usage"}, float32(ru.ResourceUsage.MemoryStats.MaxUsage)) 1327 metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "kernel_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelUsage)) 1328 metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "kernel_max_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage)) 1329 if allocatedMem > 0 { 1330 metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "allocated"}, allocatedMem) 1331 } 1332 } 1333 } 1334 1335 //TODO Remove Backwardscompat or use tr.Alloc()? 1336 func (tr *TaskRunner) setGaugeForCPU(ru *cstructs.TaskResourceUsage) { 1337 alloc := tr.Alloc() 1338 var allocatedCPU float32 1339 if alloc.AllocatedResources != nil { 1340 if taskRes := alloc.AllocatedResources.Tasks[tr.taskName]; taskRes != nil { 1341 allocatedCPU = float32(taskRes.Cpu.CpuShares) 1342 } 1343 } else if taskRes := alloc.TaskResources[tr.taskName]; taskRes != nil { 1344 // COMPAT(0.11) Remove in 0.11 when TaskResources is removed 1345 allocatedCPU = float32(taskRes.CPU) 1346 } 1347 1348 if !tr.clientConfig.DisableTaggedMetrics { 1349 metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "total_percent"}, 1350 float32(ru.ResourceUsage.CpuStats.Percent), tr.baseLabels) 1351 metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "system"}, 1352 float32(ru.ResourceUsage.CpuStats.SystemMode), tr.baseLabels) 1353 metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "user"}, 1354 float32(ru.ResourceUsage.CpuStats.UserMode), tr.baseLabels) 1355 metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "throttled_time"}, 1356 float32(ru.ResourceUsage.CpuStats.ThrottledTime), tr.baseLabels) 1357 metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "throttled_periods"}, 1358 float32(ru.ResourceUsage.CpuStats.ThrottledPeriods), tr.baseLabels) 1359 metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "total_ticks"}, 1360 float32(ru.ResourceUsage.CpuStats.TotalTicks), tr.baseLabels) 1361 if allocatedCPU > 0 { 1362 metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "allocated"}, 1363 allocatedCPU, tr.baseLabels) 1364 } 1365 } 1366 1367 if tr.clientConfig.BackwardsCompatibleMetrics { 1368 metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "total_percent"}, float32(ru.ResourceUsage.CpuStats.Percent)) 1369 metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "system"}, float32(ru.ResourceUsage.CpuStats.SystemMode)) 1370 metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "user"}, float32(ru.ResourceUsage.CpuStats.UserMode)) 1371 metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "throttled_time"}, float32(ru.ResourceUsage.CpuStats.ThrottledTime)) 1372 metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "throttled_periods"}, float32(ru.ResourceUsage.CpuStats.ThrottledPeriods)) 1373 metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "total_ticks"}, float32(ru.ResourceUsage.CpuStats.TotalTicks)) 1374 if allocatedCPU > 0 { 1375 
metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "allocated"}, allocatedCPU) 1376 } 1377 } 1378 } 1379 1380 // emitStats emits resource usage stats of tasks to remote metrics collector 1381 // sinks 1382 func (tr *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) { 1383 if !tr.clientConfig.PublishAllocationMetrics { 1384 return 1385 } 1386 1387 if ru.ResourceUsage.MemoryStats != nil { 1388 tr.setGaugeForMemory(ru) 1389 } else { 1390 tr.logger.Debug("Skipping memory stats for allocation", "reason", "MemoryStats is nil") 1391 } 1392 1393 if ru.ResourceUsage.CpuStats != nil { 1394 tr.setGaugeForCPU(ru) 1395 } else { 1396 tr.logger.Debug("Skipping cpu stats for allocation", "reason", "CpuStats is nil") 1397 } 1398 } 1399 1400 // appendTaskEvent updates the task status by appending the new event. 1401 func appendTaskEvent(state *structs.TaskState, event *structs.TaskEvent, capacity int) { 1402 if state.Events == nil { 1403 state.Events = make([]*structs.TaskEvent, 1, capacity) 1404 state.Events[0] = event 1405 return 1406 } 1407 1408 // If we hit capacity, then shift it. 1409 if len(state.Events) == capacity { 1410 old := state.Events 1411 state.Events = make([]*structs.TaskEvent, 0, capacity) 1412 state.Events = append(state.Events, old[1:]...) 1413 } 1414 1415 state.Events = append(state.Events, event) 1416 } 1417 1418 func (tr *TaskRunner) TaskExecHandler() drivermanager.TaskExecHandler { 1419 // Check it is running 1420 handle := tr.getDriverHandle() 1421 if handle == nil { 1422 return nil 1423 } 1424 return handle.ExecStreaming 1425 } 1426 1427 func (tr *TaskRunner) DriverCapabilities() (*drivers.Capabilities, error) { 1428 return tr.driver.Capabilities() 1429 }
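
// A minimal sketch of how a caller (normally the alloc runner) drives a
// TaskRunner. The Config values are placeholders, and Kill is defined
// elsewhere in this package:
//
//	tr, err := NewTaskRunner(&Config{
//		Alloc:        alloc,
//		ClientConfig: clientCfg,
//		Task:         task,
//		TaskDir:      taskDir,
//		Logger:       logger,
//		// StateDB, StateUpdater, Consul, Vault, DeviceManager,
//		// DriverManager, ServersContactedCh, ...
//	})
//	if err != nil {
//		return err
//	}
//	go tr.Run()
//	// later: tr.Update(newAlloc), tr.Kill(ctx, event), or tr.Shutdown()
//	<-tr.WaitCh()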