github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/client/allocrunner/taskrunner/task_runner.go

package taskrunner

import (
	"context"
	"errors"
	"fmt"
	"strings"
	"sync"
	"time"

	metrics "github.com/armon/go-metrics"
	log "github.com/hashicorp/go-hclog"
	multierror "github.com/hashicorp/go-multierror"
	"github.com/hashicorp/hcl2/hcldec"
	"github.com/hashicorp/nomad/client/allocdir"
	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
	"github.com/hashicorp/nomad/client/allocrunner/taskrunner/restarts"
	"github.com/hashicorp/nomad/client/allocrunner/taskrunner/state"
	"github.com/hashicorp/nomad/client/config"
	"github.com/hashicorp/nomad/client/consul"
	"github.com/hashicorp/nomad/client/devicemanager"
	"github.com/hashicorp/nomad/client/dynamicplugins"
	cinterfaces "github.com/hashicorp/nomad/client/interfaces"
	"github.com/hashicorp/nomad/client/pluginmanager/csimanager"
	"github.com/hashicorp/nomad/client/pluginmanager/drivermanager"
	cstate "github.com/hashicorp/nomad/client/state"
	cstructs "github.com/hashicorp/nomad/client/structs"
	"github.com/hashicorp/nomad/client/taskenv"
	"github.com/hashicorp/nomad/client/vaultclient"
	"github.com/hashicorp/nomad/helper/pluginutils/hclspecutils"
	"github.com/hashicorp/nomad/helper/pluginutils/hclutils"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/structs"
	bstructs "github.com/hashicorp/nomad/plugins/base/structs"
	"github.com/hashicorp/nomad/plugins/drivers"
)

const (
	// defaultMaxEvents is the default max capacity for task events on the
	// task state. Overrideable for testing.
	defaultMaxEvents = 10

	// killBackoffBaseline is the baseline time for exponential backoff while
	// killing a task.
	killBackoffBaseline = 5 * time.Second

	// killBackoffLimit is the limit of the exponential backoff for killing
	// the task.
	killBackoffLimit = 2 * time.Minute

	// killFailureLimit is how many times we will attempt to kill a task before
	// giving up and potentially leaking resources.
	killFailureLimit = 5

	// triggerUpdateChCap is the capacity for the triggerUpdateCh used for
	// triggering updates. It should be exactly 1 as even if multiple
	// updates have come in since the last one was handled, we only need to
	// handle the last one.
	triggerUpdateChCap = 1
)

type TaskRunner struct {
	// allocID, taskName, taskLeader, and taskResources are immutable so these fields may
	// be accessed without locks
	allocID       string
	taskName      string
	taskLeader    bool
	taskResources *structs.AllocatedTaskResources

	alloc     *structs.Allocation
	allocLock sync.Mutex

	clientConfig *config.Config

	// stateUpdater is used to emit updated task state
	stateUpdater interfaces.TaskStateHandler

	// state captures the state of the task for updating the allocation
	// Must acquire stateLock to access.
	state *structs.TaskState

	// localState captures the node-local state of the task for when the
	// Nomad agent restarts.
	// Must acquire stateLock to access.
	localState *state.LocalState

	// stateLock must be acquired when accessing state or localState.
	stateLock sync.RWMutex

	// stateDB is for persisting localState and taskState
	stateDB cstate.StateDB

	// shutdownCtx is used to exit the TaskRunner *without* affecting task state.
	shutdownCtx context.Context

	// shutdownCtxCancel causes the TaskRunner to exit immediately without
	// affecting task state. Useful for testing or graceful agent shutdown.
	shutdownCtxCancel context.CancelFunc

	// killCtx is the task runner's context representing the task's lifecycle.
	// The context is canceled when the task is killed.
	killCtx context.Context

	// killCtxCancel is called when killing a task.
	killCtxCancel context.CancelFunc

	// killErr is populated when killing a task. Access should be done using the
	// getter/setter.
	killErr     error
	killErrLock sync.Mutex

	// logger is the logger for the task runner.
	logger log.Logger

	// triggerUpdateCh is ticked whenever update hooks need to be run and
	// must be created with cap=1 to signal a pending update and prevent
	// callers from deadlocking if the receiver has exited.
	triggerUpdateCh chan struct{}

	// waitCh is closed when the task runner has transitioned to a terminal
	// state
	waitCh chan struct{}

	// driver is the driver for the task.
	driver drivers.DriverPlugin

	// driverCapabilities is the set of capabilities the driver supports
	driverCapabilities *drivers.Capabilities

	// taskSchema is the hcl spec for the task driver configuration
	taskSchema hcldec.Spec

	// handleLock guards access to handle and handleResult
	handleLock sync.Mutex

	// handle to the running driver
	handle *DriverHandle

	// task is the task being run
	task     *structs.Task
	taskLock sync.RWMutex

	// taskDir is the directory structure for this task.
	taskDir *allocdir.TaskDir

	// envBuilder is used to build the task's environment
	envBuilder *taskenv.Builder

	// restartTracker is used to decide if the task should be restarted.
	restartTracker *restarts.RestartTracker

	// runnerHooks are task runner lifecycle hooks that should be run on state
	// transitions.
	runnerHooks []interfaces.TaskHook

	// hookResources captures the resources provided by hooks
	hookResources *hookResources

	// consulClient is the client used by the consul service hook for
	// registering services and checks
	consulClient consul.ConsulServiceAPI

	// siClient is the client used by the service identity hook for managing
	// service identity tokens
	siClient consul.ServiceIdentityAPI

	// vaultClient is the client to use to derive and renew Vault tokens
	vaultClient vaultclient.VaultClient

	// vaultToken is the current Vault token. It should be accessed with the
	// getter.
	vaultToken     string
	vaultTokenLock sync.Mutex

	// baseLabels are used when emitting tagged metrics. All task runner metrics
	// will have these tags, and optionally more.
	baseLabels []metrics.Label

	// logmonHookConfig is used to get the paths to the stdout and stderr fifos
	// to be passed to the driver for task logging
	logmonHookConfig *logmonHookConfig

	// resourceUsage is written via UpdateStats and read via
	// LatestResourceUsage. May be nil at all times.
	resourceUsage     *cstructs.TaskResourceUsage
	resourceUsageLock sync.Mutex

	// deviceStatsReporter is used to lookup resource usage for alloc devices
	deviceStatsReporter cinterfaces.DeviceStatsReporter

	// csiManager is used to manage the mounting of CSI volumes into tasks
	csiManager csimanager.Manager

	// devicemanager is used to mount devices as well as lookup device
	// statistics
	devicemanager devicemanager.Manager

	// driverManager is used to dispense driver plugins and register event
	// handlers
	driverManager drivermanager.Manager

	// dynamicRegistry is where dynamic plugins should be registered.
	dynamicRegistry dynamicplugins.Registry

	// maxEvents is the capacity of the TaskEvents on the TaskState.
	// Defaults to defaultMaxEvents but overrideable for testing.
	maxEvents int

	// serversContactedCh is passed to TaskRunners so they can detect when
	// GetClientAllocs has been called in case of a failed restore.
	serversContactedCh <-chan struct{}

	// startConditionMetCtx is done when TR should start the task
	startConditionMetCtx <-chan struct{}

	// waitOnServers defaults to false but will be set true if a restore
	// fails and the Run method should wait until serversContactedCh is
	// closed.
	waitOnServers bool

	networkIsolationLock sync.Mutex
	networkIsolationSpec *drivers.NetworkIsolationSpec

	allocHookResources *cstructs.AllocHookResources
}

type Config struct {
	Alloc        *structs.Allocation
	ClientConfig *config.Config
	Task         *structs.Task
	TaskDir      *allocdir.TaskDir
	Logger       log.Logger

	// Consul is the client to use for managing Consul service registrations
	Consul consul.ConsulServiceAPI

	// ConsulSI is the client to use for managing Consul SI tokens
	ConsulSI consul.ServiceIdentityAPI

	// DynamicRegistry is where dynamic plugins should be registered.
	DynamicRegistry dynamicplugins.Registry

	// Vault is the client to use to derive and renew Vault tokens
	Vault vaultclient.VaultClient

	// StateDB is used to store and restore state.
	StateDB cstate.StateDB

	// StateUpdater is used to emit updated task state
	StateUpdater interfaces.TaskStateHandler

	// DeviceStatsReporter is used to lookup resource usage for alloc devices
	DeviceStatsReporter cinterfaces.DeviceStatsReporter

	// CSIManager is used to manage the mounting of CSI volumes into tasks
	CSIManager csimanager.Manager

	// DeviceManager is used to mount devices as well as lookup device
	// statistics
	DeviceManager devicemanager.Manager

	// DriverManager is used to dispense driver plugins and register event
	// handlers
	DriverManager drivermanager.Manager

	// ServersContactedCh is closed when the first GetClientAllocs call to
	// servers succeeds and allocs are synced.
	ServersContactedCh chan struct{}

	// StartConditionMetCtx is done when TR should start the task
	StartConditionMetCtx <-chan struct{}
}

func NewTaskRunner(config *Config) (*TaskRunner, error) {
	// Create a context for causing the runner to exit
	trCtx, trCancel := context.WithCancel(context.Background())

	// Create a context for killing the runner
	killCtx, killCancel := context.WithCancel(context.Background())

	// Initialize the environment builder
	envBuilder := taskenv.NewBuilder(
		config.ClientConfig.Node,
		config.Alloc,
		config.Task,
		config.ClientConfig.Region,
	)

	// Initialize state from alloc if it is set
	tstate := structs.NewTaskState()
	if ts := config.Alloc.TaskStates[config.Task.Name]; ts != nil {
		tstate = ts.Copy()
	}

	tr := &TaskRunner{
		alloc:                config.Alloc,
		allocID:              config.Alloc.ID,
		clientConfig:         config.ClientConfig,
		task:                 config.Task,
		taskDir:              config.TaskDir,
		taskName:             config.Task.Name,
		taskLeader:           config.Task.Leader,
		envBuilder:           envBuilder,
		dynamicRegistry:      config.DynamicRegistry,
		consulClient:         config.Consul,
		siClient:             config.ConsulSI,
		vaultClient:          config.Vault,
		state:                tstate,
		localState:           state.NewLocalState(),
		stateDB:              config.StateDB,
		stateUpdater:         config.StateUpdater,
		deviceStatsReporter:  config.DeviceStatsReporter,
		killCtx:              killCtx,
		killCtxCancel:        killCancel,
		shutdownCtx:          trCtx,
		shutdownCtxCancel:    trCancel,
		triggerUpdateCh:      make(chan struct{}, triggerUpdateChCap),
		waitCh:               make(chan struct{}),
		csiManager:           config.CSIManager,
		devicemanager:        config.DeviceManager,
		driverManager:        config.DriverManager,
		maxEvents:            defaultMaxEvents,
		serversContactedCh:   config.ServersContactedCh,
		startConditionMetCtx: config.StartConditionMetCtx,
	}

	// Create the logger based on the allocation ID
	tr.logger = config.Logger.Named("task_runner").With("task", config.Task.Name)

	// Pull out the task's resources
	ares := tr.alloc.AllocatedResources
	if ares == nil {
		return nil, fmt.Errorf("no task resources found on allocation")
	}

	tres, ok := ares.Tasks[tr.taskName]
	if !ok {
		return nil, fmt.Errorf("no task resources found on allocation")
	}
	tr.taskResources = tres

	// Build the restart tracker.
	rp := config.Task.RestartPolicy
	if rp == nil {
		tg := tr.alloc.Job.LookupTaskGroup(tr.alloc.TaskGroup)
		if tg == nil {
			tr.logger.Error("alloc missing task group")
			return nil, fmt.Errorf("alloc missing task group")
		}
		rp = tg.RestartPolicy
	}
	tr.restartTracker = restarts.NewRestartTracker(rp, tr.alloc.Job.Type, config.Task.Lifecycle)

	// Get the driver
	if err := tr.initDriver(); err != nil {
		tr.logger.Error("failed to create driver", "error", err)
		return nil, err
	}

	// Initialize the runner's hooks.
	tr.initHooks()

	// Initialize base labels
	tr.initLabels()

	// Initialize initial task received event
	tr.appendEvent(structs.NewTaskEvent(structs.TaskReceived))

	return tr, nil
}

func (tr *TaskRunner) initLabels() {
	alloc := tr.Alloc()
	tr.baseLabels = []metrics.Label{
		{
			Name:  "job",
			Value: alloc.Job.Name,
		},
		{
			Name:  "task_group",
			Value: alloc.TaskGroup,
		},
		{
			Name:  "alloc_id",
			Value: tr.allocID,
		},
		{
			Name:  "task",
			Value: tr.taskName,
		},
		{
			Name:  "namespace",
			Value: tr.alloc.Namespace,
		},
	}

	if tr.alloc.Job.ParentID != "" {
		tr.baseLabels = append(tr.baseLabels, metrics.Label{
			Name:  "parent_id",
			Value: tr.alloc.Job.ParentID,
		})
		if strings.Contains(tr.alloc.Job.Name, "/dispatch-") {
			tr.baseLabels = append(tr.baseLabels, metrics.Label{
				Name:  "dispatch_id",
				Value: strings.Split(tr.alloc.Job.Name, "/dispatch-")[1],
			})
		}
		if strings.Contains(tr.alloc.Job.Name, "/periodic-") {
			tr.baseLabels = append(tr.baseLabels, metrics.Label{
				Name:  "periodic_id",
				Value: strings.Split(tr.alloc.Job.Name, "/periodic-")[1],
			})
		}
	}
}

// MarkFailedDead marks a task as failed and not to run. Intended to be invoked
// when alloc runner prestart hooks fail.
// Should never be called together with Run().
func (tr *TaskRunner) MarkFailedDead(reason string) {
	defer close(tr.waitCh)

	tr.stateLock.Lock()
	if err := tr.stateDB.PutTaskRunnerLocalState(tr.allocID, tr.taskName, tr.localState); err != nil {
		//TODO Nomad will be unable to restore this task; try to kill
		// it now and fail? In general we prefer to leave running
		// tasks running even if the agent encounters an error.
		tr.logger.Warn("error persisting local failed task state; may be unable to restore after a Nomad restart",
			"error", err)
	}
	tr.stateLock.Unlock()

	event := structs.NewTaskEvent(structs.TaskSetupFailure).
		SetDisplayMessage(reason).
		SetFailsTask()
	tr.UpdateState(structs.TaskStateDead, event)

	// Run the stop hooks in case task was a restored task that failed prestart
	if err := tr.stop(); err != nil {
		tr.logger.Error("stop failed while marking task dead", "error", err)
	}
}

// Run the TaskRunner. Starts the user's task or reattaches to a restored task.
// Run closes WaitCh when it exits. Should be started in a goroutine.
func (tr *TaskRunner) Run() {
	defer close(tr.waitCh)
	var result *drivers.ExitResult

	tr.stateLock.RLock()
	dead := tr.state.State == structs.TaskStateDead
	tr.stateLock.RUnlock()

	// if restoring a dead task, ensure that task is cleared and all post hooks
	// are called without additional state updates
	if dead {
		// do cleanup functions without emitting any additional events/work
		// to handle cases where we restored a dead task where client terminated
		// after task finished before completing post-run actions.
		tr.clearDriverHandle()
		tr.stateUpdater.TaskStateUpdated()
		if err := tr.stop(); err != nil {
			tr.logger.Error("stop failed on terminal task", "error", err)
		}
		return
	}

	// Updates are handled asynchronously with the other hooks but each
	// triggered update - whether due to alloc updates or a new vault token
	// - should be handled serially.
	go tr.handleUpdates()

	// If restore failed, wait until servers are contacted before running.
474 // #1795 475 if tr.waitOnServers { 476 tr.logger.Info("task failed to restore; waiting to contact server before restarting") 477 select { 478 case <-tr.killCtx.Done(): 479 case <-tr.shutdownCtx.Done(): 480 return 481 case <-tr.serversContactedCh: 482 tr.logger.Info("server contacted; unblocking waiting task") 483 } 484 } 485 486 select { 487 case <-tr.startConditionMetCtx: 488 // yay proceed 489 case <-tr.killCtx.Done(): 490 case <-tr.shutdownCtx.Done(): 491 return 492 } 493 494 MAIN: 495 for !tr.Alloc().TerminalStatus() { 496 select { 497 case <-tr.killCtx.Done(): 498 break MAIN 499 case <-tr.shutdownCtx.Done(): 500 // TaskRunner was told to exit immediately 501 return 502 default: 503 } 504 505 // Run the prestart hooks 506 if err := tr.prestart(); err != nil { 507 tr.logger.Error("prestart failed", "error", err) 508 tr.restartTracker.SetStartError(err) 509 goto RESTART 510 } 511 512 select { 513 case <-tr.killCtx.Done(): 514 break MAIN 515 case <-tr.shutdownCtx.Done(): 516 // TaskRunner was told to exit immediately 517 return 518 default: 519 } 520 521 // Run the task 522 if err := tr.runDriver(); err != nil { 523 tr.logger.Error("running driver failed", "error", err) 524 tr.restartTracker.SetStartError(err) 525 goto RESTART 526 } 527 528 // Run the poststart hooks 529 if err := tr.poststart(); err != nil { 530 tr.logger.Error("poststart failed", "error", err) 531 } 532 533 // Grab the result proxy and wait for task to exit 534 WAIT: 535 { 536 handle := tr.getDriverHandle() 537 result = nil 538 539 // Do *not* use tr.killCtx here as it would cause 540 // Wait() to unblock before the task exits when Kill() 541 // is called. 542 if resultCh, err := handle.WaitCh(context.Background()); err != nil { 543 tr.logger.Error("wait task failed", "error", err) 544 } else { 545 select { 546 case <-tr.killCtx.Done(): 547 // We can go through the normal should restart check since 548 // the restart tracker knowns it is killed 549 result = tr.handleKill() 550 case <-tr.shutdownCtx.Done(): 551 // TaskRunner was told to exit immediately 552 return 553 case result = <-resultCh: 554 } 555 556 // WaitCh returned a result 557 if retryWait := tr.handleTaskExitResult(result); retryWait { 558 goto WAIT 559 } 560 } 561 } 562 563 // Clear the handle 564 tr.clearDriverHandle() 565 566 // Store the wait result on the restart tracker 567 tr.restartTracker.SetExitResult(result) 568 569 if err := tr.exited(); err != nil { 570 tr.logger.Error("exited hooks failed", "error", err) 571 } 572 573 RESTART: 574 restart, restartDelay := tr.shouldRestart() 575 if !restart { 576 break MAIN 577 } 578 579 // Actually restart by sleeping and also watching for destroy events 580 select { 581 case <-time.After(restartDelay): 582 case <-tr.killCtx.Done(): 583 tr.logger.Trace("task killed between restarts", "delay", restartDelay) 584 break MAIN 585 case <-tr.shutdownCtx.Done(): 586 // TaskRunner was told to exit immediately 587 tr.logger.Trace("gracefully shutting down during restart delay") 588 return 589 } 590 } 591 592 // Ensure handle is cleaned up. Restore could have recovered a task 593 // that should be terminal, so if the handle still exists we should 594 // kill it here. 
	if tr.getDriverHandle() != nil {
		if result = tr.handleKill(); result != nil {
			tr.emitExitResultEvent(result)
		}

		tr.clearDriverHandle()

		if err := tr.exited(); err != nil {
			tr.logger.Error("exited hooks failed while cleaning up terminal task", "error", err)
		}
	}

	// Mark the task as dead
	tr.UpdateState(structs.TaskStateDead, nil)

	// Run the stop hooks
	if err := tr.stop(); err != nil {
		tr.logger.Error("stop failed", "error", err)
	}

	tr.logger.Debug("task run loop exiting")
}

// handleTaskExitResult handles the results returned by the task exiting. If
// retryWait is true, the caller should attempt to wait on the task again since
// it has not actually finished running. This can happen if the driver plugin
// has exited.
func (tr *TaskRunner) handleTaskExitResult(result *drivers.ExitResult) (retryWait bool) {
	if result == nil {
		return false
	}

	if result.Err == bstructs.ErrPluginShutdown {
		dn := tr.Task().Driver
		tr.logger.Debug("driver plugin has shutdown; attempting to recover task", "driver", dn)

		// Initialize a new driver handle
		if err := tr.initDriver(); err != nil {
			tr.logger.Error("failed to initialize driver after it exited unexpectedly", "error", err, "driver", dn)
			return false
		}

		// Try to restore the handle
		tr.stateLock.RLock()
		h := tr.localState.TaskHandle
		net := tr.localState.DriverNetwork
		tr.stateLock.RUnlock()
		if !tr.restoreHandle(h, net) {
			tr.logger.Error("failed to restore handle on driver after it exited unexpectedly", "driver", dn)
			return false
		}

		tr.logger.Debug("task successfully recovered on driver", "driver", dn)
		return true
	}

	// Emit Terminated event
	tr.emitExitResultEvent(result)

	return false
}

// emitExitResultEvent emits a TaskTerminated event for an ExitResult.
func (tr *TaskRunner) emitExitResultEvent(result *drivers.ExitResult) {
	event := structs.NewTaskEvent(structs.TaskTerminated).
		SetExitCode(result.ExitCode).
		SetSignal(result.Signal).
		SetOOMKilled(result.OOMKilled).
		SetExitMessage(result.Err)

	tr.EmitEvent(event)

	if result.OOMKilled && !tr.clientConfig.DisableTaggedMetrics {
		metrics.IncrCounterWithLabels([]string{"client", "allocs", "oom_killed"}, 1, tr.baseLabels)
	}
}

// handleUpdates runs update hooks when triggerUpdateCh is ticked and exits
// when Run has returned. Should only be run in a goroutine from Run.
func (tr *TaskRunner) handleUpdates() {
	for {
		select {
		case <-tr.triggerUpdateCh:
		case <-tr.waitCh:
			return
		}

		// Non-terminal update; run hooks
		tr.updateHooks()
	}
}

// shouldRestart determines whether the task should be restarted and updates
// the task state unless the task is killed or terminated.
func (tr *TaskRunner) shouldRestart() (bool, time.Duration) {
	// Determine if we should restart
	state, when := tr.restartTracker.GetState()
	reason := tr.restartTracker.GetReason()
	switch state {
	case structs.TaskKilled:
		// Never restart an explicitly killed task. Kill method handles
		// updating the server.
		tr.EmitEvent(structs.NewTaskEvent(state))
		return false, 0
	case structs.TaskNotRestarting, structs.TaskTerminated:
		tr.logger.Info("not restarting task", "reason", reason)
		if state == structs.TaskNotRestarting {
			tr.UpdateState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskNotRestarting).SetRestartReason(reason).SetFailsTask())
		}
		return false, 0
	case structs.TaskRestarting:
		tr.logger.Info("restarting task", "reason", reason, "delay", when)
		tr.UpdateState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskRestarting).SetRestartDelay(when).SetRestartReason(reason))
		return true, when
	default:
		tr.logger.Error("restart tracker returned unknown state", "state", state)
		return true, when
	}
}

// runDriver runs the driver and waits for it to exit
// runDriver emits an appropriate task event on success/failure
func (tr *TaskRunner) runDriver() error {

	taskConfig := tr.buildTaskConfig()

	// Build hcl context variables
	vars, errs, err := tr.envBuilder.Build().AllValues()
	if err != nil {
		return fmt.Errorf("error building environment variables: %v", err)
	}

	// Handle per-key errors
	if len(errs) > 0 {
		keys := make([]string, 0, len(errs))
		for k, err := range errs {
			keys = append(keys, k)

			if tr.logger.IsTrace() {
				// Verbosely log every diagnostic for debugging
				tr.logger.Trace("error building environment variables", "key", k, "error", err)
			}
		}

		tr.logger.Warn("some environment variables not available for rendering", "keys", strings.Join(keys, ", "))
	}

	val, diag, diagErrs := hclutils.ParseHclInterface(tr.task.Config, tr.taskSchema, vars)
	if diag.HasErrors() {
		parseErr := multierror.Append(errors.New("failed to parse config: "), diagErrs...)
		tr.EmitEvent(structs.NewTaskEvent(structs.TaskFailedValidation).SetValidationError(parseErr))
		return parseErr
	}

	if err := taskConfig.EncodeDriverConfig(val); err != nil {
		encodeErr := fmt.Errorf("failed to encode driver config: %v", err)
		tr.EmitEvent(structs.NewTaskEvent(structs.TaskFailedValidation).SetValidationError(encodeErr))
		return encodeErr
	}

	// If there's already a task handle (eg from a Restore) there's nothing
	// to do except update state.
	if tr.getDriverHandle() != nil {
		// Ensure running state is persisted but do *not* append a new
		// task event as restoring is a client event and not relevant
		// to a task's lifecycle.
		if err := tr.updateStateImpl(structs.TaskStateRunning); err != nil {
			//TODO return error and destroy task to avoid an orphaned task?
			tr.logger.Warn("error persisting task state", "error", err)
		}
		return nil
	}

	// Start the job if there's no existing handle (or if RecoverTask failed)
	handle, net, err := tr.driver.StartTask(taskConfig)
	if err != nil {
		// The plugin has died, try relaunching it
		if err == bstructs.ErrPluginShutdown {
			tr.logger.Info("failed to start task because plugin shutdown unexpectedly; attempting to recover")
			if err := tr.initDriver(); err != nil {
				taskErr := fmt.Errorf("failed to initialize driver after it exited unexpectedly: %v", err)
				tr.EmitEvent(structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(taskErr))
				return taskErr
			}

			handle, net, err = tr.driver.StartTask(taskConfig)
			if err != nil {
				taskErr := fmt.Errorf("failed to start task after driver exited unexpectedly: %v", err)
				tr.EmitEvent(structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(taskErr))
				return taskErr
			}
		} else {
			// Do *NOT* wrap the error here without maintaining whether or not it is Recoverable.
			// You must emit a task event failure to be considered Recoverable
			tr.EmitEvent(structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(err))
			return err
		}
	}

	tr.stateLock.Lock()
	tr.localState.TaskHandle = handle
	tr.localState.DriverNetwork = net
	if err := tr.stateDB.PutTaskRunnerLocalState(tr.allocID, tr.taskName, tr.localState); err != nil {
		//TODO Nomad will be unable to restore this task; try to kill
		// it now and fail? In general we prefer to leave running
		// tasks running even if the agent encounters an error.
		tr.logger.Warn("error persisting local task state; may be unable to restore after a Nomad restart",
			"error", err, "task_id", handle.Config.ID)
	}
	tr.stateLock.Unlock()

	tr.setDriverHandle(NewDriverHandle(tr.driver, taskConfig.ID, tr.Task(), net))

	// Emit an event that we started
	tr.UpdateState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted))
	return nil
}

// initDriver retrieves the DriverPlugin from the plugin loader for this task
func (tr *TaskRunner) initDriver() error {
	driver, err := tr.driverManager.Dispense(tr.Task().Driver)
	if err != nil {
		return err
	}
	tr.driver = driver

	schema, err := tr.driver.TaskConfigSchema()
	if err != nil {
		return err
	}
	spec, diag := hclspecutils.Convert(schema)
	if diag.HasErrors() {
		return multierror.Append(errors.New("failed to convert task schema"), diag.Errs()...)
	}
	tr.taskSchema = spec

	caps, err := tr.driver.Capabilities()
	if err != nil {
		return err
	}
	tr.driverCapabilities = caps

	return nil
}

// handleKill is used to handle a request to kill a task. It will return
// the handle exit result if one is available and store any error in the task
// runner killErr value.
func (tr *TaskRunner) handleKill() *drivers.ExitResult {
	// Run the pre killing hooks
	tr.preKill()

	// Wait for task ShutdownDelay after running prekill hooks
	// This allows for things like service de-registration to run
	// before waiting to kill task
	if delay := tr.Task().ShutdownDelay; delay != 0 {
		tr.logger.Debug("waiting before killing task", "shutdown_delay", delay)
		time.Sleep(delay)
	}

	// Tell the restart tracker that the task has been killed so it doesn't
	// attempt to restart it.
	tr.restartTracker.SetKilled()

	// Check it is running
	handle := tr.getDriverHandle()
	if handle == nil {
		return nil
	}

	// Kill the task using an exponential backoff in case of failures.
	killErr := tr.killTask(handle)
	if killErr != nil {
		// We couldn't successfully destroy the resource created.
		tr.logger.Error("failed to kill task. Resources may have been leaked", "error", killErr)
		tr.setKillErr(killErr)
	}

	// Block until task has exited.
	waitCh, err := handle.WaitCh(tr.shutdownCtx)

	// The error should be nil or TaskNotFound; if it's something else then a
	// failure in the driver or transport layer occurred
	if err != nil {
		if err == drivers.ErrTaskNotFound {
			return nil
		}
		tr.logger.Error("failed to wait on task. Resources may have been leaked", "error", err)
		tr.setKillErr(killErr)
		return nil
	}

	select {
	case result := <-waitCh:
		return result
	case <-tr.shutdownCtx.Done():
		return nil
	}
}

// killTask kills the task handle. In the case that killing fails,
// killTask will retry with an exponential backoff and will give up at a
// given limit. Returns an error if the task could not be killed.
func (tr *TaskRunner) killTask(handle *DriverHandle) error {
	// Cap the number of times we attempt to kill the task.
	var err error
	for i := 0; i < killFailureLimit; i++ {
		if err = handle.Kill(); err != nil {
			if err == drivers.ErrTaskNotFound {
				tr.logger.Warn("couldn't find task to kill", "task_id", handle.ID())
				return nil
			}
			// Calculate the new backoff
			backoff := (1 << (2 * uint64(i))) * killBackoffBaseline
			if backoff > killBackoffLimit {
				backoff = killBackoffLimit
			}

			tr.logger.Error("failed to kill task", "backoff", backoff, "error", err)
			time.Sleep(backoff)
		} else {
			// Kill was successful
			return nil
		}
	}
	return err
}

// persistLocalState persists local state to disk synchronously.
func (tr *TaskRunner) persistLocalState() error {
	tr.stateLock.RLock()
	defer tr.stateLock.RUnlock()

	return tr.stateDB.PutTaskRunnerLocalState(tr.allocID, tr.taskName, tr.localState)
}

// buildTaskConfig builds a drivers.TaskConfig with a unique ID for the task.
// The ID is unique for every invocation; it is built from the alloc ID, task
// name and 8 random characters.
func (tr *TaskRunner) buildTaskConfig() *drivers.TaskConfig {
	task := tr.Task()
	alloc := tr.Alloc()
	invocationid := uuid.Generate()[:8]
	taskResources := tr.taskResources
	env := tr.envBuilder.Build()
	tr.networkIsolationLock.Lock()
	defer tr.networkIsolationLock.Unlock()

	return &drivers.TaskConfig{
		ID:            fmt.Sprintf("%s/%s/%s", alloc.ID, task.Name, invocationid),
		Name:          task.Name,
		JobName:       alloc.Job.Name,
		TaskGroupName: alloc.TaskGroup,
		Resources: &drivers.Resources{
			NomadResources: taskResources,
			LinuxResources: &drivers.LinuxResources{
				MemoryLimitBytes: taskResources.Memory.MemoryMB * 1024 * 1024,
				CPUShares:        taskResources.Cpu.CpuShares,
				PercentTicks:     float64(taskResources.Cpu.CpuShares) / float64(tr.clientConfig.Node.NodeResources.Cpu.CpuShares),
			},
		},
		Devices:          tr.hookResources.getDevices(),
		Mounts:           tr.hookResources.getMounts(),
		Env:              env.Map(),
		DeviceEnv:        env.DeviceEnv(),
		User:             task.User,
		AllocDir:         tr.taskDir.AllocDir,
		StdoutPath:       tr.logmonHookConfig.stdoutFifo,
		StderrPath:       tr.logmonHookConfig.stderrFifo,
		AllocID:          tr.allocID,
		NetworkIsolation: tr.networkIsolationSpec,
	}
}

// Restore task runner state. Called by AllocRunner.Restore after NewTaskRunner
// but before Run so no locks need to be acquired.
func (tr *TaskRunner) Restore() error {
	ls, ts, err := tr.stateDB.GetTaskRunnerState(tr.allocID, tr.taskName)
	if err != nil {
		return err
	}

	if ls != nil {
		ls.Canonicalize()
		tr.localState = ls
	}

	if ts != nil {
		ts.Canonicalize()
		tr.state = ts
	}

	// If a TaskHandle was persisted, ensure it is valid or destroy it.
	if taskHandle := tr.localState.TaskHandle; taskHandle != nil {
		//TODO if RecoverTask returned the DriverNetwork we wouldn't
		// have to persist it at all!
		restored := tr.restoreHandle(taskHandle, tr.localState.DriverNetwork)

		// If the handle could not be restored, the alloc is
		// non-terminal, and the task isn't a system job: wait until
		// servers have been contacted before running. #1795
		if restored {
			return nil
		}

		alloc := tr.Alloc()
		if tr.state.State == structs.TaskStateDead || alloc.TerminalStatus() || alloc.Job.Type == structs.JobTypeSystem {
			return nil
		}

		tr.logger.Trace("failed to reattach to task; will not run until server is contacted")
		tr.waitOnServers = true

		ev := structs.NewTaskEvent(structs.TaskRestoreFailed).
			SetDisplayMessage("failed to restore task; will not run until server is contacted")
		tr.UpdateState(structs.TaskStatePending, ev)
	}

	return nil
}

// restoreHandle ensures a TaskHandle is valid by calling Driver.RecoverTask
// and sets the driver handle. If the TaskHandle is not valid, DestroyTask is
// called.
func (tr *TaskRunner) restoreHandle(taskHandle *drivers.TaskHandle, net *drivers.DriverNetwork) (success bool) {
	// Ensure handle is well-formed
	if taskHandle.Config == nil {
		return true
	}

	if err := tr.driver.RecoverTask(taskHandle); err != nil {
		if tr.TaskState().State != structs.TaskStateRunning {
			// RecoverTask should fail if the Task wasn't running
			return true
		}

		tr.logger.Error("error recovering task; cleaning up",
			"error", err, "task_id", taskHandle.Config.ID)

		// Try to cleanup any existing task state in the plugin before restarting
		if err := tr.driver.DestroyTask(taskHandle.Config.ID, true); err != nil {
			// Ignore ErrTaskNotFound errors as ideally
			// this task has already been stopped and
			// therefore doesn't exist.
			if err != drivers.ErrTaskNotFound {
				tr.logger.Warn("error destroying unrecoverable task",
					"error", err, "task_id", taskHandle.Config.ID)
			}

			return false
		}

		return true
	}

	// Update driver handle on task runner
	tr.setDriverHandle(NewDriverHandle(tr.driver, taskHandle.Config.ID, tr.Task(), net))
	return true
}

// UpdateState sets the task runner's allocation state and triggers a server
// update.
func (tr *TaskRunner) UpdateState(state string, event *structs.TaskEvent) {
	tr.stateLock.Lock()
	defer tr.stateLock.Unlock()

	if event != nil {
		tr.logger.Trace("setting task state", "state", state, "event", event.Type)

		// Append the event
		tr.appendEvent(event)
	}

	// Update the state
	if err := tr.updateStateImpl(state); err != nil {
		// Only log the error as persistence errors should not
		// affect task state.
		tr.logger.Error("error persisting task state", "error", err, "event", event, "state", state)
	}

	// Notify the alloc runner of the transition
	tr.stateUpdater.TaskStateUpdated()
}

// updateStateImpl updates the in-memory task state and persists to disk.
func (tr *TaskRunner) updateStateImpl(state string) error {

	// Update the task state
	oldState := tr.state.State
	taskState := tr.state
	taskState.State = state

	// Handle the state transition.
	switch state {
	case structs.TaskStateRunning:
		// Capture the start time if it is just starting
		if oldState != structs.TaskStateRunning {
			taskState.StartedAt = time.Now().UTC()
			if !tr.clientConfig.DisableTaggedMetrics {
				metrics.IncrCounterWithLabels([]string{"client", "allocs", "running"}, 1, tr.baseLabels)
			}
			//if r.config.BackwardsCompatibleMetrics {
			//metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "running"}, 1)
			//}
		}
	case structs.TaskStateDead:
		// Capture the finished time if not already set
		if taskState.FinishedAt.IsZero() {
			taskState.FinishedAt = time.Now().UTC()
		}

		// Emit metrics to indicate task completion and failures
		if taskState.Failed {
			if !tr.clientConfig.DisableTaggedMetrics {
				metrics.IncrCounterWithLabels([]string{"client", "allocs", "failed"}, 1, tr.baseLabels)
			}
			//if r.config.BackwardsCompatibleMetrics {
			//metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "failed"}, 1)
			//}
		} else {
			if !tr.clientConfig.DisableTaggedMetrics {
				metrics.IncrCounterWithLabels([]string{"client", "allocs", "complete"}, 1, tr.baseLabels)
			}
			//if r.config.BackwardsCompatibleMetrics {
			//metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "complete"}, 1)
			//}
		}
	}

	// Persist the state and event
	return tr.stateDB.PutTaskState(tr.allocID, tr.taskName, taskState)
}

// EmitEvent appends a new TaskEvent to this task's TaskState. The actual
// TaskState.State (pending, running, dead) is not changed. Use UpdateState to
// transition states.
// Events are persisted locally and sent to the server, but errors are simply
// logged. Use AppendEvent to simply add a new event.
func (tr *TaskRunner) EmitEvent(event *structs.TaskEvent) {
	tr.stateLock.Lock()
	defer tr.stateLock.Unlock()

	tr.appendEvent(event)

	if err := tr.stateDB.PutTaskState(tr.allocID, tr.taskName, tr.state); err != nil {
		// Only a warning because the next event/state-transition will
		// try to persist it again.
		tr.logger.Warn("error persisting event", "error", err, "event", event)
	}

	// Notify the alloc runner of the event
	tr.stateUpdater.TaskStateUpdated()
}

// AppendEvent appends a new TaskEvent to this task's TaskState. The actual
// TaskState.State (pending, running, dead) is not changed. Use UpdateState to
// transition states.
// Events are persisted locally and errors are simply logged. Use EmitEvent to
// also update AllocRunner.
func (tr *TaskRunner) AppendEvent(event *structs.TaskEvent) {
	tr.stateLock.Lock()
	defer tr.stateLock.Unlock()

	tr.appendEvent(event)

	if err := tr.stateDB.PutTaskState(tr.allocID, tr.taskName, tr.state); err != nil {
		// Only a warning because the next event/state-transition will
		// try to persist it again.
		tr.logger.Warn("error persisting event", "error", err, "event", event)
	}
}

// appendEvent to task's event slice. Caller must acquire stateLock.
func (tr *TaskRunner) appendEvent(event *structs.TaskEvent) error {
	// Ensure the event is populated with human readable strings
	event.PopulateEventDisplayMessage()

	// Propagate failure from event to task state
	if event.FailsTask {
		tr.state.Failed = true
	}

	// XXX This seems like a super awkward spot for this? Why not shouldRestart?
	// Update restart metrics
	if event.Type == structs.TaskRestarting {
		if !tr.clientConfig.DisableTaggedMetrics {
			metrics.IncrCounterWithLabels([]string{"client", "allocs", "restart"}, 1, tr.baseLabels)
		}
		//if r.config.BackwardsCompatibleMetrics {
		//metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "restart"}, 1)
		//}
		tr.state.Restarts++
		tr.state.LastRestart = time.Unix(0, event.Time)
	}

	// Append event to slice
	appendTaskEvent(tr.state, event, tr.maxEvents)

	return nil
}

// WaitCh is closed when TaskRunner.Run exits.
func (tr *TaskRunner) WaitCh() <-chan struct{} {
	return tr.waitCh
}

// Update the running allocation with a new version received from the server.
// Calls Update hooks asynchronously with Run.
//
// This method is safe for calling concurrently with Run and does not modify
// the passed in allocation.
func (tr *TaskRunner) Update(update *structs.Allocation) {
	task := update.LookupTask(tr.taskName)
	if task == nil {
		// This should not happen and likely indicates a bug in the
		// server or client.
		tr.logger.Error("allocation update is missing task; killing",
			"group", update.TaskGroup)
		te := structs.NewTaskEvent(structs.TaskKilled).
			SetKillReason("update missing task").
			SetFailsTask()
		tr.Kill(context.Background(), te)
		return
	}

	// Update tr.alloc
	tr.setAlloc(update, task)

	// Trigger update hooks if not terminal
	if !update.TerminalStatus() {
		tr.triggerUpdateHooks()
	}
}

// SetNetworkIsolation is called by the PreRun allocation hook after configuring
// the network isolation for the allocation
func (tr *TaskRunner) SetNetworkIsolation(n *drivers.NetworkIsolationSpec) {
	tr.networkIsolationLock.Lock()
	tr.networkIsolationSpec = n
	tr.networkIsolationLock.Unlock()
}

// triggerUpdateHooks triggers an update if there isn't already one pending.
// Should be called instead of calling updateHooks directly to serialize runs
// of update hooks. TaskRunner state should be updated prior to triggering
// update hooks.
//
// Does not block.
func (tr *TaskRunner) triggerUpdateHooks() {
	select {
	case tr.triggerUpdateCh <- struct{}{}:
	default:
		// already an update hook pending
	}
}

// Shutdown TaskRunner gracefully without affecting the state of the task.
// Shutdown blocks until the main Run loop exits.
func (tr *TaskRunner) Shutdown() {
	tr.logger.Trace("shutting down")
	tr.shutdownCtxCancel()

	<-tr.WaitCh()

	// Run shutdown hooks to cleanup
	tr.shutdownHooks()

	// Persist once more
	tr.persistLocalState()
}

// LatestResourceUsage returns the last resource utilization datapoint
// collected. May return nil if the task is not running or no resource
// utilization has been collected yet.
func (tr *TaskRunner) LatestResourceUsage() *cstructs.TaskResourceUsage {
	tr.resourceUsageLock.Lock()
	ru := tr.resourceUsage
	tr.resourceUsageLock.Unlock()

	// Look up device statistics lazily when fetched, as we currently do not
	// emit any stats for them.
	if ru != nil && tr.deviceStatsReporter != nil {
		deviceResources := tr.taskResources.Devices
		ru.ResourceUsage.DeviceStats = tr.deviceStatsReporter.LatestDeviceResourceStats(deviceResources)
	}
	return ru
}

// UpdateStats updates and emits the latest stats from the driver.
func (tr *TaskRunner) UpdateStats(ru *cstructs.TaskResourceUsage) {
	tr.resourceUsageLock.Lock()
	tr.resourceUsage = ru
	tr.resourceUsageLock.Unlock()
	if ru != nil {
		tr.emitStats(ru)
	}
}

//TODO Remove Backwardscompat or use tr.Alloc()?
func (tr *TaskRunner) setGaugeForMemory(ru *cstructs.TaskResourceUsage) {
	alloc := tr.Alloc()
	var allocatedMem float32
	if taskRes := alloc.AllocatedResources.Tasks[tr.taskName]; taskRes != nil {
		// Convert to bytes to match other memory metrics
		allocatedMem = float32(taskRes.Memory.MemoryMB) * 1024 * 1024
	}

	if !tr.clientConfig.DisableTaggedMetrics {
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "rss"},
			float32(ru.ResourceUsage.MemoryStats.RSS), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "cache"},
			float32(ru.ResourceUsage.MemoryStats.Cache), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "swap"},
			float32(ru.ResourceUsage.MemoryStats.Swap), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "usage"},
			float32(ru.ResourceUsage.MemoryStats.Usage), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "max_usage"},
			float32(ru.ResourceUsage.MemoryStats.MaxUsage), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "kernel_usage"},
			float32(ru.ResourceUsage.MemoryStats.KernelUsage), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "kernel_max_usage"},
			float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage), tr.baseLabels)
		if allocatedMem > 0 {
			metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "allocated"},
				allocatedMem, tr.baseLabels)
		}
	}

	if tr.clientConfig.BackwardsCompatibleMetrics {
		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "rss"}, float32(ru.ResourceUsage.MemoryStats.RSS))
		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "cache"}, float32(ru.ResourceUsage.MemoryStats.Cache))
		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "swap"}, float32(ru.ResourceUsage.MemoryStats.Swap))
		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "usage"}, float32(ru.ResourceUsage.MemoryStats.Usage))
		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "max_usage"}, float32(ru.ResourceUsage.MemoryStats.MaxUsage))
		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "kernel_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelUsage))
		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "kernel_max_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage))
		if allocatedMem > 0 {
			metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "allocated"}, allocatedMem)
		}
	}
}

//TODO Remove Backwardscompat or use tr.Alloc()?
func (tr *TaskRunner) setGaugeForCPU(ru *cstructs.TaskResourceUsage) {
	alloc := tr.Alloc()
	var allocatedCPU float32
	if taskRes := alloc.AllocatedResources.Tasks[tr.taskName]; taskRes != nil {
		allocatedCPU = float32(taskRes.Cpu.CpuShares)
	}

	if !tr.clientConfig.DisableTaggedMetrics {
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "total_percent"},
			float32(ru.ResourceUsage.CpuStats.Percent), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "system"},
			float32(ru.ResourceUsage.CpuStats.SystemMode), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "user"},
			float32(ru.ResourceUsage.CpuStats.UserMode), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "throttled_time"},
			float32(ru.ResourceUsage.CpuStats.ThrottledTime), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "throttled_periods"},
			float32(ru.ResourceUsage.CpuStats.ThrottledPeriods), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "total_ticks"},
			float32(ru.ResourceUsage.CpuStats.TotalTicks), tr.baseLabels)
		if allocatedCPU > 0 {
			metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "allocated"},
				allocatedCPU, tr.baseLabels)
		}
	}

	if tr.clientConfig.BackwardsCompatibleMetrics {
		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "total_percent"}, float32(ru.ResourceUsage.CpuStats.Percent))
		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "system"}, float32(ru.ResourceUsage.CpuStats.SystemMode))
		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "user"}, float32(ru.ResourceUsage.CpuStats.UserMode))
		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "throttled_time"}, float32(ru.ResourceUsage.CpuStats.ThrottledTime))
		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "throttled_periods"}, float32(ru.ResourceUsage.CpuStats.ThrottledPeriods))
		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "total_ticks"}, float32(ru.ResourceUsage.CpuStats.TotalTicks))
		if allocatedCPU > 0 {
			metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "allocated"}, allocatedCPU)
		}
	}
}

// emitStats emits resource usage stats of tasks to remote metrics collector
// sinks
func (tr *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) {
	if !tr.clientConfig.PublishAllocationMetrics {
		return
	}

	if ru.ResourceUsage.MemoryStats != nil {
		tr.setGaugeForMemory(ru)
	} else {
		tr.logger.Debug("Skipping memory stats for allocation", "reason", "MemoryStats is nil")
	}

	if ru.ResourceUsage.CpuStats != nil {
		tr.setGaugeForCPU(ru)
	} else {
		tr.logger.Debug("Skipping cpu stats for allocation", "reason", "CpuStats is nil")
	}
}

// appendTaskEvent updates the task status by appending the new event.
func appendTaskEvent(state *structs.TaskState, event *structs.TaskEvent, capacity int) {
	if state.Events == nil {
		state.Events = make([]*structs.TaskEvent, 1, capacity)
		state.Events[0] = event
		return
	}

	// If we hit capacity, then shift it.
	if len(state.Events) == capacity {
		old := state.Events
		state.Events = make([]*structs.TaskEvent, 0, capacity)
		state.Events = append(state.Events, old[1:]...)
	}

	state.Events = append(state.Events, event)
}

func (tr *TaskRunner) TaskExecHandler() drivermanager.TaskExecHandler {
	// Check it is running
	handle := tr.getDriverHandle()
	if handle == nil {
		return nil
	}
	return handle.ExecStreaming
}

func (tr *TaskRunner) DriverCapabilities() (*drivers.Capabilities, error) {
	return tr.driver.Capabilities()
}

func (tr *TaskRunner) SetAllocHookResources(res *cstructs.AllocHookResources) {
	tr.allocHookResources = res
}
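
// Usage sketch (illustrative only): how a caller such as the alloc runner is
// expected to drive a TaskRunner using the methods defined in this file. The
// cfg value is an assumed, fully populated *Config; error handling is
// abbreviated.
//
//	tr, err := NewTaskRunner(cfg)
//	if err != nil {
//		return err
//	}
//	go tr.Run() // Run closes WaitCh when its loop exits
//
//	// Stop the task itself, recording why it was killed:
//	tr.Kill(context.Background(), structs.NewTaskEvent(structs.TaskKilled))
//	<-tr.WaitCh()
//
//	// Or, on agent shutdown, stop the runner without affecting task state:
//	tr.Shutdown()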