github.com/smithx10/nomad@v0.9.1-rc1/client/allocrunner/taskrunner/task_runner.go

package taskrunner

import (
	"context"
	"errors"
	"fmt"
	"strings"
	"sync"
	"time"

	metrics "github.com/armon/go-metrics"
	log "github.com/hashicorp/go-hclog"
	multierror "github.com/hashicorp/go-multierror"
	"github.com/hashicorp/hcl2/hcldec"
	"github.com/hashicorp/nomad/client/allocdir"
	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
	"github.com/hashicorp/nomad/client/allocrunner/taskrunner/restarts"
	"github.com/hashicorp/nomad/client/allocrunner/taskrunner/state"
	"github.com/hashicorp/nomad/client/config"
	"github.com/hashicorp/nomad/client/consul"
	"github.com/hashicorp/nomad/client/devicemanager"
	cinterfaces "github.com/hashicorp/nomad/client/interfaces"
	"github.com/hashicorp/nomad/client/pluginmanager/drivermanager"
	cstate "github.com/hashicorp/nomad/client/state"
	cstructs "github.com/hashicorp/nomad/client/structs"
	"github.com/hashicorp/nomad/client/taskenv"
	"github.com/hashicorp/nomad/client/vaultclient"
	"github.com/hashicorp/nomad/helper/pluginutils/hclspecutils"
	"github.com/hashicorp/nomad/helper/pluginutils/hclutils"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/structs"
	bstructs "github.com/hashicorp/nomad/plugins/base/structs"
	"github.com/hashicorp/nomad/plugins/drivers"
)

const (
	// defaultMaxEvents is the default max capacity for task events on the
	// task state. Overrideable for testing.
	defaultMaxEvents = 10

	// killBackoffBaseline is the baseline time for exponential backoff while
	// killing a task.
	killBackoffBaseline = 5 * time.Second

	// killBackoffLimit is the limit of the exponential backoff for killing
	// the task.
	killBackoffLimit = 2 * time.Minute

	// killFailureLimit is how many times we will attempt to kill a task before
	// giving up and potentially leaking resources.
	killFailureLimit = 5

	// triggerUpdateChCap is the capacity for the triggerUpdateCh used for
	// triggering updates. It should be exactly 1 as even if multiple
	// updates have come in since the last one was handled, we only need to
	// handle the last one.
	triggerUpdateChCap = 1
)

type TaskRunner struct {
	// allocID, taskName, taskLeader, and taskResources are immutable, so
	// these fields may be accessed without locks.
	allocID       string
	taskName      string
	taskLeader    bool
	taskResources *structs.AllocatedTaskResources

	alloc     *structs.Allocation
	allocLock sync.Mutex

	clientConfig *config.Config

	// stateUpdater is used to emit updated task state
	stateUpdater interfaces.TaskStateHandler

	// state captures the state of the task for updating the allocation.
	// Must acquire stateLock to access.
	state *structs.TaskState

	// localState captures the node-local state of the task for when the
	// Nomad agent restarts.
	// Must acquire stateLock to access.
	localState *state.LocalState

	// stateLock must be acquired when accessing state or localState.
	stateLock sync.RWMutex

	// stateDB is for persisting localState and taskState
	stateDB cstate.StateDB

	// shutdownCtx is used to exit the TaskRunner *without* affecting task state.
	shutdownCtx context.Context

	// shutdownCtxCancel causes the TaskRunner to exit immediately without
	// affecting task state. Useful for testing or graceful agent shutdown.
	shutdownCtxCancel context.CancelFunc

	// killCtx is the task runner's context representing the task's lifecycle.
	// The context is canceled when the task is killed.
	killCtx context.Context

	// killCtxCancel is called when killing a task.
	killCtxCancel context.CancelFunc

	// killErr is populated when killing a task. Access should be done via the
	// getter/setter.
	killErr     error
	killErrLock sync.Mutex

	// logger is the logger for the task runner.
	logger log.Logger

	// triggerUpdateCh is ticked whenever update hooks need to be run and
	// must be created with cap=1 to signal a pending update and prevent
	// callers from deadlocking if the receiver has exited.
	triggerUpdateCh chan struct{}

	// waitCh is closed when the task runner has transitioned to a terminal
	// state
	waitCh chan struct{}

	// driver is the driver for the task.
	driver drivers.DriverPlugin

	// driverCapabilities is the set of capabilities the driver supports
	driverCapabilities *drivers.Capabilities

	// taskSchema is the hcl spec for the task driver configuration
	taskSchema hcldec.Spec

	// handleLock guards access to handle and handleResult
	handleLock sync.Mutex

	// handle to the running driver
	handle *DriverHandle

	// task is the task being run
	task     *structs.Task
	taskLock sync.RWMutex

	// taskDir is the directory structure for this task.
	taskDir *allocdir.TaskDir

	// envBuilder is used to build the task's environment
	envBuilder *taskenv.Builder

	// restartTracker is used to decide if the task should be restarted.
	restartTracker *restarts.RestartTracker

	// runnerHooks are task runner lifecycle hooks that should be run on state
	// transitions.
	runnerHooks []interfaces.TaskHook

	// hookResources captures the resources provided by hooks
	hookResources *hookResources

	// consulClient is the client used by the consul service hook for
	// registering services and checks
	consulClient consul.ConsulServiceAPI

	// vaultClient is the client to use to derive and renew Vault tokens
	vaultClient vaultclient.VaultClient

	// vaultToken is the current Vault token. It should be accessed with the
	// getter.
	vaultToken     string
	vaultTokenLock sync.Mutex

	// baseLabels are used when emitting tagged metrics. All task runner metrics
	// will have these tags, and optionally more.
	baseLabels []metrics.Label

	// logmonHookConfig is used to get the paths to the stdout and stderr fifos
	// to be passed to the driver for task logging
	logmonHookConfig *logmonHookConfig

	// resourceUsage is written via UpdateStats and read via
	// LatestResourceUsage. May be nil at any time.
	resourceUsage     *cstructs.TaskResourceUsage
	resourceUsageLock sync.Mutex

	// deviceStatsReporter is used to lookup resource usage for alloc devices
	deviceStatsReporter cinterfaces.DeviceStatsReporter

	// devicemanager is used to mount devices as well as lookup device
	// statistics
	devicemanager devicemanager.Manager

	// driverManager is used to dispense driver plugins and register event
	// handlers
	driverManager drivermanager.Manager

	// runLaunched marks whether the Run goroutine has been started. It should
	// be accessed via helpers.
	runLaunched     bool
	runLaunchedLock sync.Mutex

	// maxEvents is the capacity of the TaskEvents on the TaskState.
	// Defaults to defaultMaxEvents but overrideable for testing.
	maxEvents int
}

type Config struct {
	Alloc        *structs.Allocation
	ClientConfig *config.Config
	Consul       consul.ConsulServiceAPI
	Task         *structs.Task
	TaskDir      *allocdir.TaskDir
	Logger       log.Logger

	// Vault is the client to use to derive and renew Vault tokens
	Vault vaultclient.VaultClient

	// StateDB is used to store and restore state.
	StateDB cstate.StateDB

	// StateUpdater is used to emit updated task state
	StateUpdater interfaces.TaskStateHandler

	// DeviceStatsReporter is used to lookup resource usage for alloc devices
	DeviceStatsReporter cinterfaces.DeviceStatsReporter

	// DeviceManager is used to mount devices as well as lookup device
	// statistics
	DeviceManager devicemanager.Manager

	// DriverManager is used to dispense driver plugins and register event
	// handlers
	DriverManager drivermanager.Manager
}

func NewTaskRunner(config *Config) (*TaskRunner, error) {
	// Create a context for causing the runner to exit
	trCtx, trCancel := context.WithCancel(context.Background())

	// Create a context for killing the runner
	killCtx, killCancel := context.WithCancel(context.Background())

	// Initialize the environment builder
	envBuilder := taskenv.NewBuilder(
		config.ClientConfig.Node,
		config.Alloc,
		config.Task,
		config.ClientConfig.Region,
	)

	// Initialize state from alloc if it is set
	tstate := structs.NewTaskState()
	if ts := config.Alloc.TaskStates[config.Task.Name]; ts != nil {
		tstate = ts.Copy()
	}

	tr := &TaskRunner{
		alloc:               config.Alloc,
		allocID:             config.Alloc.ID,
		clientConfig:        config.ClientConfig,
		task:                config.Task,
		taskDir:             config.TaskDir,
		taskName:            config.Task.Name,
		taskLeader:          config.Task.Leader,
		envBuilder:          envBuilder,
		consulClient:        config.Consul,
		vaultClient:         config.Vault,
		state:               tstate,
		localState:          state.NewLocalState(),
		stateDB:             config.StateDB,
		stateUpdater:        config.StateUpdater,
		deviceStatsReporter: config.DeviceStatsReporter,
		killCtx:             killCtx,
		killCtxCancel:       killCancel,
		shutdownCtx:         trCtx,
		shutdownCtxCancel:   trCancel,
		triggerUpdateCh:     make(chan struct{}, triggerUpdateChCap),
		waitCh:              make(chan struct{}),
		devicemanager:       config.DeviceManager,
		driverManager:       config.DriverManager,
		maxEvents:           defaultMaxEvents,
	}

	// Create the logger based on the allocation ID
	tr.logger = config.Logger.Named("task_runner").With("task", config.Task.Name)

	// Pull out the task's resources
	ares := tr.alloc.AllocatedResources
	if ares != nil {
		tres, ok := ares.Tasks[tr.taskName]
		if !ok {
			return nil, fmt.Errorf("no task resources found on allocation")
		}
		tr.taskResources = tres
	} else {
		// COMPAT(0.10): Upgrade from old resources to new resources
		// Grab the old task resources
		oldTr, ok := tr.alloc.TaskResources[tr.taskName]
		if !ok {
			return nil, fmt.Errorf("no task resources found on allocation")
		}

		// Convert the old to new
		tr.taskResources = &structs.AllocatedTaskResources{
			Cpu: structs.AllocatedCpuResources{
				CpuShares: int64(oldTr.CPU),
			},
			Memory: structs.AllocatedMemoryResources{
				MemoryMB: int64(oldTr.MemoryMB),
			},
			Networks: oldTr.Networks,
		}
	}

	// Build the restart tracker.
	tg := tr.alloc.Job.LookupTaskGroup(tr.alloc.TaskGroup)
	if tg == nil {
		tr.logger.Error("alloc missing task group")
		return nil, fmt.Errorf("alloc missing task group")
	}
	tr.restartTracker = restarts.NewRestartTracker(tg.RestartPolicy, tr.alloc.Job.Type)

	// Get the driver
	if err := tr.initDriver(); err != nil {
		tr.logger.Error("failed to create driver", "error", err)
		return nil, err
	}

	// Initialize the runner's hooks.
	tr.initHooks()

	// Initialize base labels
	tr.initLabels()

	// Initialize initial task received event
	tr.appendEvent(structs.NewTaskEvent(structs.TaskReceived))

	return tr, nil
}

func (tr *TaskRunner) initLabels() {
	alloc := tr.Alloc()
	tr.baseLabels = []metrics.Label{
		{
			Name:  "job",
			Value: alloc.Job.Name,
		},
		{
			Name:  "task_group",
			Value: alloc.TaskGroup,
		},
		{
			Name:  "alloc_id",
			Value: tr.allocID,
		},
		{
			Name:  "task",
			Value: tr.taskName,
		},
	}

	if tr.alloc.Job.ParentID != "" {
		tr.baseLabels = append(tr.baseLabels, metrics.Label{
			Name:  "parent_id",
			Value: tr.alloc.Job.ParentID,
		})
		if strings.Contains(tr.alloc.Job.Name, "/dispatch-") {
			tr.baseLabels = append(tr.baseLabels, metrics.Label{
				Name:  "dispatch_id",
				Value: strings.Split(tr.alloc.Job.Name, "/dispatch-")[1],
			})
		}
		if strings.Contains(tr.alloc.Job.Name, "/periodic-") {
			tr.baseLabels = append(tr.baseLabels, metrics.Label{
				Name:  "periodic_id",
				Value: strings.Split(tr.alloc.Job.Name, "/periodic-")[1],
			})
		}
	}
}

// Run the TaskRunner. Starts the user's task or reattaches to a restored task.
// Run closes WaitCh when it exits. Should be started in a goroutine.
func (tr *TaskRunner) Run() {
	// Mark that the run routine has been launched so that other functions can
	// decide to use the wait channel or not.
	tr.setRunLaunched()

	defer close(tr.waitCh)
	var result *drivers.ExitResult

	// Updates are handled asynchronously with the other hooks but each
	// triggered update - whether due to alloc updates or a new vault token
	// - should be handled serially.
	go tr.handleUpdates()

MAIN:
	for !tr.Alloc().TerminalStatus() {
		select {
		case <-tr.killCtx.Done():
			break MAIN
		case <-tr.shutdownCtx.Done():
			// TaskRunner was told to exit immediately
			return
		default:
		}

		// Run the prestart hooks
		if err := tr.prestart(); err != nil {
			tr.logger.Error("prestart failed", "error", err)
			tr.restartTracker.SetStartError(err)
			goto RESTART
		}

		select {
		case <-tr.killCtx.Done():
			break MAIN
		case <-tr.shutdownCtx.Done():
			// TaskRunner was told to exit immediately
			return
		default:
		}

		// Run the task
		if err := tr.runDriver(); err != nil {
			tr.logger.Error("running driver failed", "error", err)
			tr.EmitEvent(structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(err))
			tr.restartTracker.SetStartError(err)
			goto RESTART
		}

		// Run the poststart hooks
		if err := tr.poststart(); err != nil {
			tr.logger.Error("poststart failed", "error", err)
		}

		// Grab the result proxy and wait for task to exit
	WAIT:
		{
			handle := tr.getDriverHandle()
			result = nil

			// Do *not* use tr.killCtx here as it would cause
			// Wait() to unblock before the task exits when Kill()
			// is called.
			if resultCh, err := handle.WaitCh(context.Background()); err != nil {
				tr.logger.Error("wait task failed", "error", err)
			} else {
				select {
				case <-tr.killCtx.Done():
					// We can go through the normal should restart check since
					// the restart tracker knows it is killed
					result = tr.handleKill()
				case <-tr.shutdownCtx.Done():
					// TaskRunner was told to exit immediately
					return
				case result = <-resultCh:
				}

				// WaitCh returned a result
				if retryWait := tr.handleTaskExitResult(result); retryWait {
					goto WAIT
				}
			}
		}

		// Clear the handle
		tr.clearDriverHandle()

		// Store the wait result on the restart tracker
		tr.restartTracker.SetExitResult(result)

		if err := tr.exited(); err != nil {
			tr.logger.Error("exited hooks failed", "error", err)
		}

	RESTART:
		restart, restartDelay := tr.shouldRestart()
		if !restart {
			break MAIN
		}

		// Actually restart by sleeping and also watching for destroy events
		select {
		case <-time.After(restartDelay):
		case <-tr.killCtx.Done():
			tr.logger.Trace("task killed between restarts", "delay", restartDelay)
			break MAIN
		case <-tr.shutdownCtx.Done():
			// TaskRunner was told to exit immediately
			tr.logger.Trace("gracefully shutting down during restart delay")
			return
		}
	}

	// Ensure handle is cleaned up. Restore could have recovered a task
	// that should be terminal, so if the handle still exists we should
	// kill it here.
	if tr.getDriverHandle() != nil {
		if result = tr.handleKill(); result != nil {
			tr.emitExitResultEvent(result)
		}

		tr.clearDriverHandle()

		if err := tr.exited(); err != nil {
			tr.logger.Error("exited hooks failed while cleaning up terminal task", "error", err)
		}
	}

	// Mark the task as dead
	tr.UpdateState(structs.TaskStateDead, nil)

	// Run the stop hooks
	if err := tr.stop(); err != nil {
		tr.logger.Error("stop failed", "error", err)
	}

	tr.logger.Debug("task run loop exiting")
}
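
// A typical fresh-start lifecycle, sketched from the doc comments above; the
// surrounding alloc runner wiring is omitted and cfg is a hypothetical,
// fully populated *Config:
//
//	tr, err := NewTaskRunner(cfg)
//	if err != nil {
//		return err
//	}
//	go tr.Run()   // Run starts the task and closes WaitCh on exit
//	<-tr.WaitCh() // blocks until the run loop reaches a terminal state
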
// handleTaskExitResult handles the results returned by the task exiting. If
// retryWait is true, the caller should attempt to wait on the task again since
// it has not actually finished running. This can happen if the driver plugin
// has exited.
func (tr *TaskRunner) handleTaskExitResult(result *drivers.ExitResult) (retryWait bool) {
	if result == nil {
		return false
	}

	if result.Err == bstructs.ErrPluginShutdown {
		dn := tr.Task().Driver
		tr.logger.Debug("driver plugin has shutdown; attempting to recover task", "driver", dn)

		// Initialize a new driver handle
		if err := tr.initDriver(); err != nil {
			tr.logger.Error("failed to initialize driver after it exited unexpectedly", "error", err, "driver", dn)
			return false
		}

		// Try to restore the handle
		tr.stateLock.RLock()
		h := tr.localState.TaskHandle
		net := tr.localState.DriverNetwork
		tr.stateLock.RUnlock()
		if !tr.restoreHandle(h, net) {
			tr.logger.Error("failed to restore handle on driver after it exited unexpectedly", "driver", dn)
			return false
		}

		tr.logger.Debug("task successfully recovered on driver", "driver", dn)
		return true
	}

	// Emit Terminated event
	tr.emitExitResultEvent(result)

	return false
}

// emitExitResultEvent emits a TaskTerminated event for an ExitResult.
func (tr *TaskRunner) emitExitResultEvent(result *drivers.ExitResult) {
	event := structs.NewTaskEvent(structs.TaskTerminated).
		SetExitCode(result.ExitCode).
		SetSignal(result.Signal).
		SetOOMKilled(result.OOMKilled).
		SetExitMessage(result.Err)

	tr.EmitEvent(event)

	if result.OOMKilled && !tr.clientConfig.DisableTaggedMetrics {
		metrics.IncrCounterWithLabels([]string{"client", "allocs", "oom_killed"}, 1, tr.baseLabels)
	}
}

// handleUpdates runs update hooks when triggerUpdateCh is ticked and exits
// when Run has returned. Should only be run in a goroutine from Run.
func (tr *TaskRunner) handleUpdates() {
	for {
		select {
		case <-tr.triggerUpdateCh:
		case <-tr.waitCh:
			return
		}

		// Non-terminal update; run hooks
		tr.updateHooks()
	}
}
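
// handleUpdates pairs with triggerUpdateHooks further below: because
// triggerUpdateCh has a capacity of one, concurrent update triggers coalesce
// into a single pending signal and the hooks always run against the latest
// TaskRunner state. The producer side is just a non-blocking send:
//
//	select {
//	case tr.triggerUpdateCh <- struct{}{}:
//	default: // an update is already pending
//	}
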
// shouldRestart determines whether the task should be restarted and updates
// the task state unless the task is killed or terminated.
func (tr *TaskRunner) shouldRestart() (bool, time.Duration) {
	// Determine if we should restart
	state, when := tr.restartTracker.GetState()
	reason := tr.restartTracker.GetReason()
	switch state {
	case structs.TaskKilled:
		// Never restart an explicitly killed task. The Kill method handles
		// updating the server.
		tr.EmitEvent(structs.NewTaskEvent(state))
		return false, 0
	case structs.TaskNotRestarting, structs.TaskTerminated:
		tr.logger.Info("not restarting task", "reason", reason)
		if state == structs.TaskNotRestarting {
			tr.UpdateState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskNotRestarting).SetRestartReason(reason).SetFailsTask())
		}
		return false, 0
	case structs.TaskRestarting:
		tr.logger.Info("restarting task", "reason", reason, "delay", when)
		tr.UpdateState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskRestarting).SetRestartDelay(when).SetRestartReason(reason))
		return true, when
	default:
		tr.logger.Error("restart tracker returned unknown state", "state", state)
		return true, when
	}
}

// runDriver runs the driver and waits for it to exit
func (tr *TaskRunner) runDriver() error {

	taskConfig := tr.buildTaskConfig()

	// Build hcl context variables
	vars, errs, err := tr.envBuilder.Build().AllValues()
	if err != nil {
		return fmt.Errorf("error building environment variables: %v", err)
	}

	// Handle per-key errors
	if len(errs) > 0 {
		keys := make([]string, 0, len(errs))
		for k, err := range errs {
			keys = append(keys, k)

			if tr.logger.IsTrace() {
				// Verbosely log every diagnostic for debugging
				tr.logger.Trace("error building environment variables", "key", k, "error", err)
			}
		}

		tr.logger.Warn("some environment variables not available for rendering", "keys", strings.Join(keys, ", "))
	}

	val, diag := hclutils.ParseHclInterface(tr.task.Config, tr.taskSchema, vars)
	if diag.HasErrors() {
		return multierror.Append(errors.New("failed to parse config"), diag.Errs()...)
	}

	if err := taskConfig.EncodeDriverConfig(val); err != nil {
		return fmt.Errorf("failed to encode driver config: %v", err)
	}

	// If there's already a task handle (eg from a Restore) there's nothing
	// to do except update state.
	if tr.getDriverHandle() != nil {
		// Ensure running state is persisted but do *not* append a new
		// task event as restoring is a client event and not relevant
		// to a task's lifecycle.
		if err := tr.updateStateImpl(structs.TaskStateRunning); err != nil {
			//TODO return error and destroy task to avoid an orphaned task?
			tr.logger.Warn("error persisting task state", "error", err)
		}
		return nil
	}

	// Start the task if there's no existing handle (or if RecoverTask failed)
	handle, net, err := tr.driver.StartTask(taskConfig)
	if err != nil {
		// The plugin has died, try relaunching it
		if err == bstructs.ErrPluginShutdown {
			tr.logger.Info("failed to start task because plugin shutdown unexpectedly; attempting to recover")
			if err := tr.initDriver(); err != nil {
				return fmt.Errorf("failed to initialize driver after it exited unexpectedly: %v", err)
			}

			handle, net, err = tr.driver.StartTask(taskConfig)
			if err != nil {
				return fmt.Errorf("failed to start task after driver exited unexpectedly: %v", err)
			}
		} else {
			// Do *NOT* wrap the error here without maintaining
			// whether or not it is Recoverable.
			return err
		}
	}

	tr.stateLock.Lock()
	tr.localState.TaskHandle = handle
	tr.localState.DriverNetwork = net
	if err := tr.stateDB.PutTaskRunnerLocalState(tr.allocID, tr.taskName, tr.localState); err != nil {
		//TODO Nomad will be unable to restore this task; try to kill
		//     it now and fail? In general we prefer to leave running
		//     tasks running even if the agent encounters an error.
		tr.logger.Warn("error persisting local task state; may be unable to restore after a Nomad restart",
			"error", err, "task_id", handle.Config.ID)
	}
	tr.stateLock.Unlock()

	tr.setDriverHandle(NewDriverHandle(tr.driver, taskConfig.ID, tr.Task(), net))

	// Emit an event that we started
	tr.UpdateState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted))
	return nil
}

// initDriver retrieves the DriverPlugin from the plugin loader for this task
func (tr *TaskRunner) initDriver() error {
	driver, err := tr.driverManager.Dispense(tr.Task().Driver)
	if err != nil {
		return err
	}
	tr.driver = driver

	schema, err := tr.driver.TaskConfigSchema()
	if err != nil {
		return err
	}
	spec, diag := hclspecutils.Convert(schema)
	if diag.HasErrors() {
		return multierror.Append(errors.New("failed to convert task schema"), diag.Errs()...)
	}
	tr.taskSchema = spec

	caps, err := tr.driver.Capabilities()
	if err != nil {
		return err
	}
	tr.driverCapabilities = caps

	return nil
}

// handleKill is used to handle a request to kill a task. It will return
// the handle exit result if one is available and store any error in the task
// runner killErr value.
func (tr *TaskRunner) handleKill() *drivers.ExitResult {
	// Run the pre-kill hooks
	tr.preKill()

	// Tell the restart tracker that the task has been killed so it doesn't
	// attempt to restart it.
	tr.restartTracker.SetKilled()

	// Check that it is running
	handle := tr.getDriverHandle()
	if handle == nil {
		return nil
	}

	// Kill the task using an exponential backoff in case of failures.
	killErr := tr.killTask(handle)
	if killErr != nil {
		// We couldn't successfully destroy the resource created.
		tr.logger.Error("failed to kill task. Resources may have been leaked", "error", killErr)
		tr.setKillErr(killErr)
	}

	// Block until the task has exited.
	waitCh, err := handle.WaitCh(tr.shutdownCtx)

	// The error should be nil or TaskNotFound; if it's something else, a
	// failure in the driver or transport layer occurred.
	if err != nil {
		if err == drivers.ErrTaskNotFound {
			return nil
		}
		tr.logger.Error("failed to wait on task. Resources may have been leaked", "error", err)
		tr.setKillErr(killErr)
		return nil
	}

	select {
	case result := <-waitCh:
		return result
	case <-tr.shutdownCtx.Done():
		return nil
	}
}

// killTask kills the task handle. In the case that killing fails,
// killTask will retry with an exponential backoff and will give up at a
// given limit. Returns an error if the task could not be killed.
func (tr *TaskRunner) killTask(handle *DriverHandle) error {
	// Cap the number of times we attempt to kill the task.
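	// With killBackoffBaseline (5s), killBackoffLimit (2m), and
	// killFailureLimit (5), the waits between failed attempts work out to
	// roughly 5s, 20s, 1m20s, 2m, 2m: the backoff grows 4x per failure
	// (1 << (2*i)) until it hits the cap.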
	var err error
	for i := 0; i < killFailureLimit; i++ {
		if err = handle.Kill(); err != nil {
			if err == drivers.ErrTaskNotFound {
				tr.logger.Warn("couldn't find task to kill", "task_id", handle.ID())
				return nil
			}
			// Calculate the new backoff
			backoff := (1 << (2 * uint64(i))) * killBackoffBaseline
			if backoff > killBackoffLimit {
				backoff = killBackoffLimit
			}

			tr.logger.Error("failed to kill task", "backoff", backoff, "error", err)
			time.Sleep(backoff)
		} else {
			// Kill was successful
			return nil
		}
	}
	return err
}

// persistLocalState persists local state to disk synchronously.
func (tr *TaskRunner) persistLocalState() error {
	tr.stateLock.RLock()
	defer tr.stateLock.RUnlock()

	return tr.stateDB.PutTaskRunnerLocalState(tr.allocID, tr.taskName, tr.localState)
}

// buildTaskConfig builds a drivers.TaskConfig with a unique ID for the task.
// The ID is unique for every invocation; it is built from the alloc ID, task
// name, and 8 random characters.
func (tr *TaskRunner) buildTaskConfig() *drivers.TaskConfig {
	task := tr.Task()
	alloc := tr.Alloc()
	invocationid := uuid.Generate()[:8]
	taskResources := tr.taskResources
	env := tr.envBuilder.Build()

	return &drivers.TaskConfig{
		ID:            fmt.Sprintf("%s/%s/%s", alloc.ID, task.Name, invocationid),
		Name:          task.Name,
		JobName:       alloc.Job.Name,
		TaskGroupName: alloc.TaskGroup,
		Resources: &drivers.Resources{
			NomadResources: taskResources,
			LinuxResources: &drivers.LinuxResources{
				MemoryLimitBytes: taskResources.Memory.MemoryMB * 1024 * 1024,
				CPUShares:        taskResources.Cpu.CpuShares,
				PercentTicks:     float64(taskResources.Cpu.CpuShares) / float64(tr.clientConfig.Node.NodeResources.Cpu.CpuShares),
			},
		},
		Devices:    tr.hookResources.getDevices(),
		Mounts:     tr.hookResources.getMounts(),
		Env:        env.Map(),
		DeviceEnv:  env.DeviceEnv(),
		User:       task.User,
		AllocDir:   tr.taskDir.AllocDir,
		StdoutPath: tr.logmonHookConfig.stdoutFifo,
		StderrPath: tr.logmonHookConfig.stderrFifo,
		AllocID:    tr.allocID,
	}
}

// Restore task runner state. Called by AllocRunner.Restore after NewTaskRunner
// but before Run so no locks need to be acquired.
func (tr *TaskRunner) Restore() error {
	ls, ts, err := tr.stateDB.GetTaskRunnerState(tr.allocID, tr.taskName)
	if err != nil {
		return err
	}

	if ls != nil {
		ls.Canonicalize()
		tr.localState = ls
	}

	if ts != nil {
		ts.Canonicalize()
		tr.state = ts
	}

	// If a TaskHandle was persisted, ensure it is valid or destroy it.
	if taskHandle := tr.localState.TaskHandle; taskHandle != nil {
		//TODO if RecoverTask returned the DriverNetwork we wouldn't
		//     have to persist it at all!
		tr.restoreHandle(taskHandle, tr.localState.DriverNetwork)
	}
	return nil
}
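
// Restore above and restoreHandle below make up the reattach path used when
// the Nomad agent restarts: AllocRunner.Restore calls Restore before Run, and
// a persisted handle is handed back to the driver via RecoverTask. A rough
// sketch of the caller's side (alloc runner wiring omitted and hypothetical):
//
//	tr, _ := NewTaskRunner(cfg)
//	if err := tr.Restore(); err != nil { // reattaches to a still-running task when possible
//		return err
//	}
//	go tr.Run()
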
// restoreHandle ensures a TaskHandle is valid by calling Driver.RecoverTask
// and sets the driver handle. If the TaskHandle is not valid, DestroyTask is
// called.
func (tr *TaskRunner) restoreHandle(taskHandle *drivers.TaskHandle, net *drivers.DriverNetwork) (success bool) {
	// Ensure handle is well-formed
	if taskHandle.Config == nil {
		return true
	}

	if err := tr.driver.RecoverTask(taskHandle); err != nil {
		if tr.TaskState().State != structs.TaskStateRunning {
			// RecoverTask should fail if the Task wasn't running
			return true
		}

		tr.logger.Error("error recovering task; cleaning up",
			"error", err, "task_id", taskHandle.Config.ID)

		// Try to cleanup any existing task state in the plugin before restarting
		if err := tr.driver.DestroyTask(taskHandle.Config.ID, true); err != nil {
			// Ignore ErrTaskNotFound errors as ideally
			// this task has already been stopped and
			// therefore doesn't exist.
			if err != drivers.ErrTaskNotFound {
				tr.logger.Warn("error destroying unrecoverable task",
					"error", err, "task_id", taskHandle.Config.ID)
			}

			return false
		}

		return true
	}

	// Update driver handle on task runner
	tr.setDriverHandle(NewDriverHandle(tr.driver, taskHandle.Config.ID, tr.Task(), net))
	return true
}

// UpdateState sets the task runner's allocation state and triggers a server
// update.
func (tr *TaskRunner) UpdateState(state string, event *structs.TaskEvent) {
	tr.stateLock.Lock()
	defer tr.stateLock.Unlock()

	if event != nil {
		tr.logger.Trace("setting task state", "state", state, "event", event.Type)

		// Append the event
		tr.appendEvent(event)
	}

	// Update the state
	if err := tr.updateStateImpl(state); err != nil {
		// Only log the error as persistence errors should not
		// affect task state.
		tr.logger.Error("error persisting task state", "error", err, "event", event, "state", state)
	}

	// Notify the alloc runner of the transition
	tr.stateUpdater.TaskStateUpdated()
}

// updateStateImpl updates the in-memory task state and persists it to disk.
func (tr *TaskRunner) updateStateImpl(state string) error {

	// Update the task state
	oldState := tr.state.State
	taskState := tr.state
	taskState.State = state

	// Handle the state transition.
	switch state {
	case structs.TaskStateRunning:
		// Capture the start time if it is just starting
		if oldState != structs.TaskStateRunning {
			taskState.StartedAt = time.Now().UTC()
			if !tr.clientConfig.DisableTaggedMetrics {
				metrics.IncrCounterWithLabels([]string{"client", "allocs", "running"}, 1, tr.baseLabels)
			}
			//if r.config.BackwardsCompatibleMetrics {
			//metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "running"}, 1)
			//}
		}
	case structs.TaskStateDead:
		// Capture the finished time if not already set
		if taskState.FinishedAt.IsZero() {
			taskState.FinishedAt = time.Now().UTC()
		}

		// Emit metrics to indicate task completion or failure
		if taskState.Failed {
			if !tr.clientConfig.DisableTaggedMetrics {
				metrics.IncrCounterWithLabels([]string{"client", "allocs", "failed"}, 1, tr.baseLabels)
			}
			//if r.config.BackwardsCompatibleMetrics {
			//metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "failed"}, 1)
			//}
		} else {
			if !tr.clientConfig.DisableTaggedMetrics {
				metrics.IncrCounterWithLabels([]string{"client", "allocs", "complete"}, 1, tr.baseLabels)
			}
			//if r.config.BackwardsCompatibleMetrics {
			//metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "complete"}, 1)
			//}
		}
	}

	// Persist the state and event
	return tr.stateDB.PutTaskState(tr.allocID, tr.taskName, taskState)
}

// EmitEvent appends a new TaskEvent to this task's TaskState. The actual
// TaskState.State (pending, running, dead) is not changed. Use UpdateState to
// transition states.
// Events are persisted locally and sent to the server, but errors are simply
// logged. Use AppendEvent to simply add a new event.
func (tr *TaskRunner) EmitEvent(event *structs.TaskEvent) {
	tr.stateLock.Lock()
	defer tr.stateLock.Unlock()

	tr.appendEvent(event)

	if err := tr.stateDB.PutTaskState(tr.allocID, tr.taskName, tr.state); err != nil {
		// Only a warning because the next event/state-transition will
		// try to persist it again.
		tr.logger.Warn("error persisting event", "error", err, "event", event)
	}

	// Notify the alloc runner of the event
	tr.stateUpdater.TaskStateUpdated()
}

// AppendEvent appends a new TaskEvent to this task's TaskState. The actual
// TaskState.State (pending, running, dead) is not changed. Use UpdateState to
// transition states.
// Events are persisted locally and errors are simply logged. Use EmitEvent to
// also update the AllocRunner.
func (tr *TaskRunner) AppendEvent(event *structs.TaskEvent) {
	tr.stateLock.Lock()
	defer tr.stateLock.Unlock()

	tr.appendEvent(event)

	if err := tr.stateDB.PutTaskState(tr.allocID, tr.taskName, tr.state); err != nil {
		// Only a warning because the next event/state-transition will
		// try to persist it again.
		tr.logger.Warn("error persisting event", "error", err, "event", event)
	}
}
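
// The three public entry points above differ only in scope (sketch, using
// event types that appear elsewhere in this file):
//
//	tr.UpdateState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted)) // state change + event + alloc runner notification
//	tr.EmitEvent(structs.NewTaskEvent(structs.TaskDriverFailure))                       // event + alloc runner notification, state unchanged
//	tr.AppendEvent(structs.NewTaskEvent(structs.TaskReceived))                          // event only, persisted locally
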
// appendEvent to the task's event slice. Caller must acquire stateLock.
func (tr *TaskRunner) appendEvent(event *structs.TaskEvent) error {
	// Ensure the event is populated with human readable strings
	event.PopulateEventDisplayMessage()

	// Propagate failure from event to task state
	if event.FailsTask {
		tr.state.Failed = true
	}

	// XXX This seems like a super awkward spot for this? Why not shouldRestart?
	// Update restart metrics
	if event.Type == structs.TaskRestarting {
		if !tr.clientConfig.DisableTaggedMetrics {
			metrics.IncrCounterWithLabels([]string{"client", "allocs", "restart"}, 1, tr.baseLabels)
		}
		//if r.config.BackwardsCompatibleMetrics {
		//metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "restart"}, 1)
		//}
		tr.state.Restarts++
		tr.state.LastRestart = time.Unix(0, event.Time)
	}

	// Append event to slice
	appendTaskEvent(tr.state, event, tr.maxEvents)

	return nil
}

// WaitCh is closed when TaskRunner.Run exits.
func (tr *TaskRunner) WaitCh() <-chan struct{} {
	return tr.waitCh
}

// Update the running allocation with a new version received from the server.
// Calls Update hooks asynchronously with Run.
//
// This method is safe for calling concurrently with Run and does not modify
// the passed in allocation.
func (tr *TaskRunner) Update(update *structs.Allocation) {
	task := update.LookupTask(tr.taskName)
	if task == nil {
		// This should not happen and likely indicates a bug in the
		// server or client.
		tr.logger.Error("allocation update is missing task; killing",
			"group", update.TaskGroup)
		te := structs.NewTaskEvent(structs.TaskKilled).
			SetKillReason("update missing task").
			SetFailsTask()
		tr.Kill(context.Background(), te)
		return
	}

	// Update tr.alloc
	tr.setAlloc(update, task)

	// Trigger update hooks if not terminal
	if !update.TerminalStatus() {
		tr.triggerUpdateHooks()
	}
}

// triggerUpdateHooks triggers an update hook run if there isn't already one
// pending. Should be called instead of calling updateHooks directly to
// serialize runs of update hooks. TaskRunner state should be updated prior to
// triggering update hooks.
//
// Does not block.
func (tr *TaskRunner) triggerUpdateHooks() {
	select {
	case tr.triggerUpdateCh <- struct{}{}:
	default:
		// already an update hook pending
	}
}

// Shutdown TaskRunner gracefully without affecting the state of the task.
// Shutdown blocks until the main Run loop exits.
func (tr *TaskRunner) Shutdown() {
	tr.logger.Trace("shutting down")
	tr.shutdownCtxCancel()

	<-tr.WaitCh()

	// Run shutdown hooks to cleanup
	tr.shutdownHooks()

	// Persist once more
	tr.persistLocalState()
}
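
// Note the split between the two cancellation paths: Shutdown (above) cancels
// shutdownCtx so the run loop exits while the task itself keeps running,
// whereas actually stopping the task goes through killCtx and handleKill,
// e.g. as Update does when an allocation update is missing its task:
//
//	tr.Kill(context.Background(), structs.NewTaskEvent(structs.TaskKilled))
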
// LatestResourceUsage returns the last resource utilization datapoint
// collected. May return nil if the task is not running or no resource
// utilization has been collected yet.
func (tr *TaskRunner) LatestResourceUsage() *cstructs.TaskResourceUsage {
	tr.resourceUsageLock.Lock()
	ru := tr.resourceUsage
	tr.resourceUsageLock.Unlock()

	// Look up device statistics lazily when fetched, as currently we do not
	// emit any stats for them yet
	if ru != nil && tr.deviceStatsReporter != nil {
		deviceResources := tr.taskResources.Devices
		ru.ResourceUsage.DeviceStats = tr.deviceStatsReporter.LatestDeviceResourceStats(deviceResources)
	}
	return ru
}

// UpdateStats updates and emits the latest stats from the driver.
func (tr *TaskRunner) UpdateStats(ru *cstructs.TaskResourceUsage) {
	tr.resourceUsageLock.Lock()
	tr.resourceUsage = ru
	tr.resourceUsageLock.Unlock()
	if ru != nil {
		tr.emitStats(ru)
	}
}

//TODO Remove Backwardscompat or use tr.Alloc()?
func (tr *TaskRunner) setGaugeForMemory(ru *cstructs.TaskResourceUsage) {
	if !tr.clientConfig.DisableTaggedMetrics {
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "rss"},
			float32(ru.ResourceUsage.MemoryStats.RSS), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "cache"},
			float32(ru.ResourceUsage.MemoryStats.Cache), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "swap"},
			float32(ru.ResourceUsage.MemoryStats.Swap), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "usage"},
			float32(ru.ResourceUsage.MemoryStats.Usage), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "max_usage"},
			float32(ru.ResourceUsage.MemoryStats.MaxUsage), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "kernel_usage"},
			float32(ru.ResourceUsage.MemoryStats.KernelUsage), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "kernel_max_usage"},
			float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage), tr.baseLabels)
	}

	if tr.clientConfig.BackwardsCompatibleMetrics {
		metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "rss"}, float32(ru.ResourceUsage.MemoryStats.RSS))
		metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "cache"}, float32(ru.ResourceUsage.MemoryStats.Cache))
		metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "swap"}, float32(ru.ResourceUsage.MemoryStats.Swap))
		metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "usage"}, float32(ru.ResourceUsage.MemoryStats.Usage))
		metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "max_usage"}, float32(ru.ResourceUsage.MemoryStats.MaxUsage))
		metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "kernel_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelUsage))
		metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "kernel_max_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage))
	}
}
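
// The tagged form above emits fixed keys such as {"client", "allocs",
// "memory", "rss"} with job/task_group/alloc_id/task attached as labels (see
// initLabels), while the backwards-compatible form bakes those identifiers
// into the key itself and therefore produces one metric name per allocation.
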
//TODO Remove Backwardscompat or use tr.Alloc()?
func (tr *TaskRunner) setGaugeForCPU(ru *cstructs.TaskResourceUsage) {
	if !tr.clientConfig.DisableTaggedMetrics {
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "total_percent"},
			float32(ru.ResourceUsage.CpuStats.Percent), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "system"},
			float32(ru.ResourceUsage.CpuStats.SystemMode), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "user"},
			float32(ru.ResourceUsage.CpuStats.UserMode), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "throttled_time"},
			float32(ru.ResourceUsage.CpuStats.ThrottledTime), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "throttled_periods"},
			float32(ru.ResourceUsage.CpuStats.ThrottledPeriods), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "total_ticks"},
			float32(ru.ResourceUsage.CpuStats.TotalTicks), tr.baseLabels)
	}

	if tr.clientConfig.BackwardsCompatibleMetrics {
		metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "total_percent"}, float32(ru.ResourceUsage.CpuStats.Percent))
		metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "system"}, float32(ru.ResourceUsage.CpuStats.SystemMode))
		metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "user"}, float32(ru.ResourceUsage.CpuStats.UserMode))
		metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "throttled_time"}, float32(ru.ResourceUsage.CpuStats.ThrottledTime))
		metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "throttled_periods"}, float32(ru.ResourceUsage.CpuStats.ThrottledPeriods))
		metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "total_ticks"}, float32(ru.ResourceUsage.CpuStats.TotalTicks))
	}
}

// emitStats emits resource usage stats of tasks to remote metrics collector
// sinks
func (tr *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) {
	if !tr.clientConfig.PublishAllocationMetrics {
		return
	}

	if ru.ResourceUsage.MemoryStats != nil {
		tr.setGaugeForMemory(ru)
	}

	if ru.ResourceUsage.CpuStats != nil {
		tr.setGaugeForCPU(ru)
	}
}

// appendTaskEvent updates the task status by appending the new event.
func appendTaskEvent(state *structs.TaskState, event *structs.TaskEvent, capacity int) {
	if state.Events == nil {
		state.Events = make([]*structs.TaskEvent, 1, capacity)
		state.Events[0] = event
		return
	}

	// If we hit capacity, then shift it.
	if len(state.Events) == capacity {
		old := state.Events
		state.Events = make([]*structs.TaskEvent, 0, capacity)
		state.Events = append(state.Events, old[1:]...)
	}

	state.Events = append(state.Events, event)
}
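
// appendTaskEvent keeps a sliding window of the most recent events: with the
// default capacity of 10 (defaultMaxEvents), appending an 11th event drops
// the oldest one first, so TaskState.Events always holds at most the 10
// latest entries.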