github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/client/allocrunner/taskrunner/task_runner.go

package taskrunner

import (
	"context"
	"errors"
	"fmt"
	"strings"
	"sync"
	"time"

	metrics "github.com/armon/go-metrics"
	log "github.com/hashicorp/go-hclog"
	multierror "github.com/hashicorp/go-multierror"
	"github.com/hashicorp/hcl2/hcldec"
	"github.com/hashicorp/nomad/client/allocdir"
	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
	"github.com/hashicorp/nomad/client/allocrunner/taskrunner/restarts"
	"github.com/hashicorp/nomad/client/allocrunner/taskrunner/state"
	"github.com/hashicorp/nomad/client/config"
	"github.com/hashicorp/nomad/client/consul"
	"github.com/hashicorp/nomad/client/devicemanager"
	"github.com/hashicorp/nomad/client/dynamicplugins"
	cinterfaces "github.com/hashicorp/nomad/client/interfaces"
	"github.com/hashicorp/nomad/client/pluginmanager/csimanager"
	"github.com/hashicorp/nomad/client/pluginmanager/drivermanager"
	cstate "github.com/hashicorp/nomad/client/state"
	cstructs "github.com/hashicorp/nomad/client/structs"
	"github.com/hashicorp/nomad/client/taskenv"
	"github.com/hashicorp/nomad/client/vaultclient"
	"github.com/hashicorp/nomad/helper/pluginutils/hclspecutils"
	"github.com/hashicorp/nomad/helper/pluginutils/hclutils"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/structs"
	bstructs "github.com/hashicorp/nomad/plugins/base/structs"
	"github.com/hashicorp/nomad/plugins/drivers"
)

const (
	// defaultMaxEvents is the default max capacity for task events on the
	// task state. Overrideable for testing.
	defaultMaxEvents = 10

	// killBackoffBaseline is the baseline time for exponential backoff while
	// killing a task.
	killBackoffBaseline = 5 * time.Second

	// killBackoffLimit is the limit of the exponential backoff for killing
	// the task.
	killBackoffLimit = 2 * time.Minute

	// killFailureLimit is how many times we will attempt to kill a task before
	// giving up and potentially leaking resources.
	killFailureLimit = 5

	// triggerUpdateChCap is the capacity for the triggerUpdateCh used for
	// triggering updates. It should be exactly 1 as even if multiple
	// updates have come in since the last one was handled, we only need to
	// handle the last one.
	triggerUpdateChCap = 1
)

type TaskRunner struct {
	// allocID, taskName, taskLeader, and taskResources are immutable so these fields may
	// be accessed without locks
	allocID       string
	taskName      string
	taskLeader    bool
	taskResources *structs.AllocatedTaskResources

	alloc     *structs.Allocation
	allocLock sync.Mutex

	clientConfig *config.Config

	// stateUpdater is used to emit updated task state
	stateUpdater interfaces.TaskStateHandler

	// state captures the state of the task for updating the allocation
	// Must acquire stateLock to access.
	state *structs.TaskState

	// localState captures the node-local state of the task for when the
	// Nomad agent restarts.
	// Must acquire stateLock to access.
	localState *state.LocalState

	// stateLock must be acquired when accessing state or localState.
	stateLock sync.RWMutex

	// stateDB is for persisting localState and taskState
	stateDB cstate.StateDB

	// shutdownCtx is used to exit the TaskRunner *without* affecting task state.
	shutdownCtx context.Context

	// shutdownCtxCancel causes the TaskRunner to exit immediately without
	// affecting task state. Useful for testing or graceful agent shutdown.
	shutdownCtxCancel context.CancelFunc

	// killCtx is the task runner's context representing the task's lifecycle.
	// The context is canceled when the task is killed.
	killCtx context.Context

	// killCtxCancel is called when killing a task.
	killCtxCancel context.CancelFunc

	// killErr is populated when killing a task. Access should be done using the
	// getter/setter.
	killErr     error
	killErrLock sync.Mutex

	// logger is the logger for the task runner.
	logger log.Logger

	// triggerUpdateCh is ticked whenever update hooks need to be run and
	// must be created with cap=1 to signal a pending update and prevent
	// callers from deadlocking if the receiver has exited.
	triggerUpdateCh chan struct{}

	// waitCh is closed when the task runner has transitioned to a terminal
	// state
	waitCh chan struct{}

	// driver is the driver for the task.
	driver drivers.DriverPlugin

	// driverCapabilities is the set of capabilities the driver supports
	driverCapabilities *drivers.Capabilities

	// taskSchema is the hcl spec for the task driver configuration
	taskSchema hcldec.Spec

	// handleLock guards access to handle and handleResult
	handleLock sync.Mutex

	// handle to the running driver
	handle *DriverHandle

	// task is the task being run
	task     *structs.Task
	taskLock sync.RWMutex

	// taskDir is the directory structure for this task.
	taskDir *allocdir.TaskDir

	// envBuilder is used to build the task's environment
	envBuilder *taskenv.Builder

	// restartTracker is used to decide if the task should be restarted.
	restartTracker *restarts.RestartTracker

	// runnerHooks are task runner lifecycle hooks that should be run on state
	// transitions.
	runnerHooks []interfaces.TaskHook

	// hookResources captures the resources provided by hooks
	hookResources *hookResources

	// consulClient is the client used by the consul service hook for
	// registering services and checks
	consulClient consul.ConsulServiceAPI

	// siClient is the client used by the service identity hook for managing
	// service identity tokens
	siClient consul.ServiceIdentityAPI

	// vaultClient is the client to use to derive and renew Vault tokens
	vaultClient vaultclient.VaultClient

	// vaultToken is the current Vault token. It should be accessed with the
	// getter.
	vaultToken     string
	vaultTokenLock sync.Mutex

	// baseLabels are used when emitting tagged metrics. All task runner metrics
	// will have these tags, and optionally more.
	baseLabels []metrics.Label

	// logmonHookConfig is used to get the paths to the stdout and stderr fifos
	// to be passed to the driver for task logging
	logmonHookConfig *logmonHookConfig

	// resourceUsage is written via UpdateStats and read via
	// LatestResourceUsage. May be nil at all times.
	resourceUsage     *cstructs.TaskResourceUsage
	resourceUsageLock sync.Mutex

	// deviceStatsReporter is used to lookup resource usage for alloc devices
	deviceStatsReporter cinterfaces.DeviceStatsReporter

	// csiManager is used to manage the mounting of CSI volumes into tasks
	csiManager csimanager.Manager

	// devicemanager is used to mount devices as well as lookup device
	// statistics
	devicemanager devicemanager.Manager

	// driverManager is used to dispense driver plugins and register event
	// handlers
	driverManager drivermanager.Manager

	// dynamicRegistry is where dynamic plugins should be registered.
	dynamicRegistry dynamicplugins.Registry

	// maxEvents is the capacity of the TaskEvents on the TaskState.
	// Defaults to defaultMaxEvents but overrideable for testing.
	maxEvents int

	// serversContactedCh is passed to TaskRunners so they can detect when
	// GetClientAllocs has been called in case of a failed restore.
	serversContactedCh <-chan struct{}

	// startConditionMetCtx is done when TR should start the task
	startConditionMetCtx <-chan struct{}

	// waitOnServers defaults to false but will be set true if a restore
	// fails and the Run method should wait until serversContactedCh is
	// closed.
	waitOnServers bool

	networkIsolationLock sync.Mutex
	networkIsolationSpec *drivers.NetworkIsolationSpec

	allocHookResources *cstructs.AllocHookResources
}

type Config struct {
	Alloc        *structs.Allocation
	ClientConfig *config.Config
	Task         *structs.Task
	TaskDir      *allocdir.TaskDir
	Logger       log.Logger

	// Consul is the client to use for managing Consul service registrations
	Consul consul.ConsulServiceAPI

	// ConsulSI is the client to use for managing Consul SI tokens
	ConsulSI consul.ServiceIdentityAPI

	// DynamicRegistry is where dynamic plugins should be registered.
	DynamicRegistry dynamicplugins.Registry

	// Vault is the client to use to derive and renew Vault tokens
	Vault vaultclient.VaultClient

	// StateDB is used to store and restore state.
	StateDB cstate.StateDB

	// StateUpdater is used to emit updated task state
	StateUpdater interfaces.TaskStateHandler

	// DeviceStatsReporter is used to lookup resource usage for alloc devices
	DeviceStatsReporter cinterfaces.DeviceStatsReporter

	// CSIManager is used to manage the mounting of CSI volumes into tasks
	CSIManager csimanager.Manager

	// DeviceManager is used to mount devices as well as lookup device
	// statistics
	DeviceManager devicemanager.Manager

	// DriverManager is used to dispense driver plugins and register event
	// handlers
	DriverManager drivermanager.Manager

	// ServersContactedCh is closed when the first GetClientAllocs call to
	// servers succeeds and allocs are synced.
	ServersContactedCh chan struct{}

	// StartConditionMetCtx is done when TR should start the task
	StartConditionMetCtx <-chan struct{}
}

func NewTaskRunner(config *Config) (*TaskRunner, error) {
	// Create a context for causing the runner to exit
	trCtx, trCancel := context.WithCancel(context.Background())

	// Create a context for killing the runner
	killCtx, killCancel := context.WithCancel(context.Background())

	// Initialize the environment builder
	envBuilder := taskenv.NewBuilder(
		config.ClientConfig.Node,
		config.Alloc,
		config.Task,
		config.ClientConfig.Region,
	)

	// Initialize state from alloc if it is set
	tstate := structs.NewTaskState()
	if ts := config.Alloc.TaskStates[config.Task.Name]; ts != nil {
		tstate = ts.Copy()
	}

	tr := &TaskRunner{
		alloc:                config.Alloc,
		allocID:              config.Alloc.ID,
		clientConfig:         config.ClientConfig,
		task:                 config.Task,
		taskDir:              config.TaskDir,
		taskName:             config.Task.Name,
		taskLeader:           config.Task.Leader,
		envBuilder:           envBuilder,
		dynamicRegistry:      config.DynamicRegistry,
		consulClient:         config.Consul,
		siClient:             config.ConsulSI,
		vaultClient:          config.Vault,
		state:                tstate,
		localState:           state.NewLocalState(),
		stateDB:              config.StateDB,
		stateUpdater:         config.StateUpdater,
		deviceStatsReporter:  config.DeviceStatsReporter,
		killCtx:              killCtx,
		killCtxCancel:        killCancel,
		shutdownCtx:          trCtx,
		shutdownCtxCancel:    trCancel,
		triggerUpdateCh:      make(chan struct{}, triggerUpdateChCap),
		waitCh:               make(chan struct{}),
		csiManager:           config.CSIManager,
		devicemanager:        config.DeviceManager,
		driverManager:        config.DriverManager,
		maxEvents:            defaultMaxEvents,
		serversContactedCh:   config.ServersContactedCh,
		startConditionMetCtx: config.StartConditionMetCtx,
	}

	// Create the logger based on the allocation ID
	tr.logger = config.Logger.Named("task_runner").With("task", config.Task.Name)

	// Pull out the task's resources
	ares := tr.alloc.AllocatedResources
	if ares == nil {
		return nil, fmt.Errorf("no task resources found on allocation")
	}

	tres, ok := ares.Tasks[tr.taskName]
	if !ok {
		return nil, fmt.Errorf("no task resources found on allocation")
	}
	tr.taskResources = tres

	// Build the restart tracker.
	rp := config.Task.RestartPolicy
	if rp == nil {
		tg := tr.alloc.Job.LookupTaskGroup(tr.alloc.TaskGroup)
		if tg == nil {
			tr.logger.Error("alloc missing task group")
			return nil, fmt.Errorf("alloc missing task group")
		}
		rp = tg.RestartPolicy
	}
	tr.restartTracker = restarts.NewRestartTracker(rp, tr.alloc.Job.Type, config.Task.Lifecycle)

	// Get the driver
	if err := tr.initDriver(); err != nil {
		tr.logger.Error("failed to create driver", "error", err)
		return nil, err
	}

	// Initialize the runner's hooks.
	tr.initHooks()

	// Initialize base labels
	tr.initLabels()

	// Initialize initial task received event
	tr.appendEvent(structs.NewTaskEvent(structs.TaskReceived))

	return tr, nil
}

func (tr *TaskRunner) initLabels() {
	alloc := tr.Alloc()
	tr.baseLabels = []metrics.Label{
		{
			Name:  "job",
			Value: alloc.Job.Name,
		},
		{
			Name:  "task_group",
			Value: alloc.TaskGroup,
		},
		{
			Name:  "alloc_id",
			Value: tr.allocID,
		},
		{
			Name:  "task",
			Value: tr.taskName,
		},
		{
			Name:  "namespace",
			Value: tr.alloc.Namespace,
		},
	}

	if tr.alloc.Job.ParentID != "" {
		tr.baseLabels = append(tr.baseLabels, metrics.Label{
			Name:  "parent_id",
			Value: tr.alloc.Job.ParentID,
		})
		if strings.Contains(tr.alloc.Job.Name, "/dispatch-") {
			tr.baseLabels = append(tr.baseLabels, metrics.Label{
				Name:  "dispatch_id",
				Value: strings.Split(tr.alloc.Job.Name, "/dispatch-")[1],
			})
		}
		if strings.Contains(tr.alloc.Job.Name, "/periodic-") {
			tr.baseLabels = append(tr.baseLabels, metrics.Label{
				Name:  "periodic_id",
				Value: strings.Split(tr.alloc.Job.Name, "/periodic-")[1],
			})
		}
	}
}

// MarkFailedDead marks a task as failed and not to run. Intended to be invoked
// when alloc runner prestart hooks fail.
// Should never be called together with Run().
func (tr *TaskRunner) MarkFailedDead(reason string) {
	defer close(tr.waitCh)

	tr.stateLock.Lock()
	if err := tr.stateDB.PutTaskRunnerLocalState(tr.allocID, tr.taskName, tr.localState); err != nil {
		//TODO Nomad will be unable to restore this task; try to kill
		// it now and fail? In general we prefer to leave running
		// tasks running even if the agent encounters an error.
		tr.logger.Warn("error persisting local failed task state; may be unable to restore after a Nomad restart",
			"error", err)
	}
	tr.stateLock.Unlock()

	event := structs.NewTaskEvent(structs.TaskSetupFailure).
		SetDisplayMessage(reason).
		SetFailsTask()
	tr.UpdateState(structs.TaskStateDead, event)

	// Run the stop hooks in case task was a restored task that failed prestart
	if err := tr.stop(); err != nil {
		tr.logger.Error("stop failed while marking task dead", "error", err)
	}
}

// Run the TaskRunner. Starts the user's task or reattaches to a restored task.
// Run closes WaitCh when it exits. Should be started in a goroutine.
func (tr *TaskRunner) Run() {
	defer close(tr.waitCh)
	var result *drivers.ExitResult

	tr.stateLock.RLock()
	dead := tr.state.State == structs.TaskStateDead
	tr.stateLock.RUnlock()

	// if restoring a dead task, ensure that task is cleared and all post hooks
	// are called without additional state updates
	if dead {
		// do cleanup functions without emitting any additional events/work
		// to handle cases where we restored a dead task where client terminated
		// after task finished before completing post-run actions.
		tr.clearDriverHandle()
		tr.stateUpdater.TaskStateUpdated()
		if err := tr.stop(); err != nil {
			tr.logger.Error("stop failed on terminal task", "error", err)
		}
		return
	}

	// Updates are handled asynchronously with the other hooks but each
	// triggered update - whether due to alloc updates or a new vault token
	// - should be handled serially.
	go tr.handleUpdates()

	// If restore failed, wait until servers are contacted before running.
474 // #1795 475 if tr.waitOnServers { 476 tr.logger.Info("task failed to restore; waiting to contact server before restarting") 477 select { 478 case <-tr.killCtx.Done(): 479 case <-tr.shutdownCtx.Done(): 480 return 481 case <-tr.serversContactedCh: 482 tr.logger.Info("server contacted; unblocking waiting task") 483 } 484 } 485 486 select { 487 case <-tr.startConditionMetCtx: 488 // yay proceed 489 case <-tr.killCtx.Done(): 490 case <-tr.shutdownCtx.Done(): 491 return 492 } 493 494 MAIN: 495 for !tr.Alloc().TerminalStatus() { 496 select { 497 case <-tr.killCtx.Done(): 498 break MAIN 499 case <-tr.shutdownCtx.Done(): 500 // TaskRunner was told to exit immediately 501 return 502 default: 503 } 504 505 // Run the prestart hooks 506 if err := tr.prestart(); err != nil { 507 tr.logger.Error("prestart failed", "error", err) 508 tr.restartTracker.SetStartError(err) 509 goto RESTART 510 } 511 512 select { 513 case <-tr.killCtx.Done(): 514 break MAIN 515 case <-tr.shutdownCtx.Done(): 516 // TaskRunner was told to exit immediately 517 return 518 default: 519 } 520 521 // Run the task 522 if err := tr.runDriver(); err != nil { 523 tr.logger.Error("running driver failed", "error", err) 524 tr.restartTracker.SetStartError(err) 525 goto RESTART 526 } 527 528 // Run the poststart hooks 529 if err := tr.poststart(); err != nil { 530 tr.logger.Error("poststart failed", "error", err) 531 } 532 533 // Grab the result proxy and wait for task to exit 534 WAIT: 535 { 536 handle := tr.getDriverHandle() 537 result = nil 538 539 // Do *not* use tr.killCtx here as it would cause 540 // Wait() to unblock before the task exits when Kill() 541 // is called. 542 if resultCh, err := handle.WaitCh(context.Background()); err != nil { 543 tr.logger.Error("wait task failed", "error", err) 544 } else { 545 select { 546 case <-tr.killCtx.Done(): 547 // We can go through the normal should restart check since 548 // the restart tracker knowns it is killed 549 result = tr.handleKill() 550 case <-tr.shutdownCtx.Done(): 551 // TaskRunner was told to exit immediately 552 return 553 case result = <-resultCh: 554 } 555 556 // WaitCh returned a result 557 if retryWait := tr.handleTaskExitResult(result); retryWait { 558 goto WAIT 559 } 560 } 561 } 562 563 // Clear the handle 564 tr.clearDriverHandle() 565 566 // Store the wait result on the restart tracker 567 tr.restartTracker.SetExitResult(result) 568 569 if err := tr.exited(); err != nil { 570 tr.logger.Error("exited hooks failed", "error", err) 571 } 572 573 RESTART: 574 restart, restartDelay := tr.shouldRestart() 575 if !restart { 576 break MAIN 577 } 578 579 // Actually restart by sleeping and also watching for destroy events 580 select { 581 case <-time.After(restartDelay): 582 case <-tr.killCtx.Done(): 583 tr.logger.Trace("task killed between restarts", "delay", restartDelay) 584 break MAIN 585 case <-tr.shutdownCtx.Done(): 586 // TaskRunner was told to exit immediately 587 tr.logger.Trace("gracefully shutting down during restart delay") 588 return 589 } 590 } 591 592 // Ensure handle is cleaned up. Restore could have recovered a task 593 // that should be terminal, so if the handle still exists we should 594 // kill it here. 
	if tr.getDriverHandle() != nil {
		if result = tr.handleKill(); result != nil {
			tr.emitExitResultEvent(result)
		}

		tr.clearDriverHandle()

		if err := tr.exited(); err != nil {
			tr.logger.Error("exited hooks failed while cleaning up terminal task", "error", err)
		}
	}

	// Mark the task as dead
	tr.UpdateState(structs.TaskStateDead, nil)

	// Run the stop hooks
	if err := tr.stop(); err != nil {
		tr.logger.Error("stop failed", "error", err)
	}

	tr.logger.Debug("task run loop exiting")
}

// handleTaskExitResult handles the results returned by the task exiting. If
// retryWait is true, the caller should attempt to wait on the task again since
// it has not actually finished running. This can happen if the driver plugin
// has exited.
func (tr *TaskRunner) handleTaskExitResult(result *drivers.ExitResult) (retryWait bool) {
	if result == nil {
		return false
	}

	if result.Err == bstructs.ErrPluginShutdown {
		dn := tr.Task().Driver
		tr.logger.Debug("driver plugin has shutdown; attempting to recover task", "driver", dn)

		// Initialize a new driver handle
		if err := tr.initDriver(); err != nil {
			tr.logger.Error("failed to initialize driver after it exited unexpectedly", "error", err, "driver", dn)
			return false
		}

		// Try to restore the handle
		tr.stateLock.RLock()
		h := tr.localState.TaskHandle
		net := tr.localState.DriverNetwork
		tr.stateLock.RUnlock()
		if !tr.restoreHandle(h, net) {
			tr.logger.Error("failed to restore handle on driver after it exited unexpectedly", "driver", dn)
			return false
		}

		tr.logger.Debug("task successfully recovered on driver", "driver", dn)
		return true
	}

	// Emit Terminated event
	tr.emitExitResultEvent(result)

	return false
}

// emitExitResultEvent emits a TaskTerminated event for an ExitResult.
func (tr *TaskRunner) emitExitResultEvent(result *drivers.ExitResult) {
	event := structs.NewTaskEvent(structs.TaskTerminated).
		SetExitCode(result.ExitCode).
		SetSignal(result.Signal).
		SetOOMKilled(result.OOMKilled).
		SetExitMessage(result.Err)

	tr.EmitEvent(event)

	if result.OOMKilled && !tr.clientConfig.DisableTaggedMetrics {
		metrics.IncrCounterWithLabels([]string{"client", "allocs", "oom_killed"}, 1, tr.baseLabels)
	}
}

// handleUpdates runs update hooks when triggerUpdateCh is ticked and exits
// when Run has returned. Should only be run in a goroutine from Run.
func (tr *TaskRunner) handleUpdates() {
	for {
		select {
		case <-tr.triggerUpdateCh:
		case <-tr.waitCh:
			return
		}

		// Non-terminal update; run hooks
		tr.updateHooks()
	}
}

// shouldRestart determines whether the task should be restarted and updates
// the task state unless the task is killed or terminated.
func (tr *TaskRunner) shouldRestart() (bool, time.Duration) {
	// Determine if we should restart
	state, when := tr.restartTracker.GetState()
	reason := tr.restartTracker.GetReason()
	switch state {
	case structs.TaskKilled:
		// Never restart an explicitly killed task. Kill method handles
		// updating the server.
		tr.EmitEvent(structs.NewTaskEvent(state))
		return false, 0
	case structs.TaskNotRestarting, structs.TaskTerminated:
		tr.logger.Info("not restarting task", "reason", reason)
		if state == structs.TaskNotRestarting {
			tr.UpdateState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskNotRestarting).SetRestartReason(reason).SetFailsTask())
		}
		return false, 0
	case structs.TaskRestarting:
		tr.logger.Info("restarting task", "reason", reason, "delay", when)
		tr.UpdateState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskRestarting).SetRestartDelay(when).SetRestartReason(reason))
		return true, when
	default:
		tr.logger.Error("restart tracker returned unknown state", "state", state)
		return true, when
	}
}

// runDriver runs the driver and waits for it to exit
// runDriver emits an appropriate task event on success/failure
func (tr *TaskRunner) runDriver() error {

	taskConfig := tr.buildTaskConfig()

	// Build hcl context variables
	vars, errs, err := tr.envBuilder.Build().AllValues()
	if err != nil {
		return fmt.Errorf("error building environment variables: %v", err)
	}

	// Handle per-key errors
	if len(errs) > 0 {
		keys := make([]string, 0, len(errs))
		for k, err := range errs {
			keys = append(keys, k)

			if tr.logger.IsTrace() {
				// Verbosely log every diagnostic for debugging
				tr.logger.Trace("error building environment variables", "key", k, "error", err)
			}
		}

		tr.logger.Warn("some environment variables not available for rendering", "keys", strings.Join(keys, ", "))
	}

	val, diag, diagErrs := hclutils.ParseHclInterface(tr.task.Config, tr.taskSchema, vars)
	if diag.HasErrors() {
		parseErr := multierror.Append(errors.New("failed to parse config: "), diagErrs...)
		tr.EmitEvent(structs.NewTaskEvent(structs.TaskFailedValidation).SetValidationError(parseErr))
		return parseErr
	}

	if err := taskConfig.EncodeDriverConfig(val); err != nil {
		encodeErr := fmt.Errorf("failed to encode driver config: %v", err)
		tr.EmitEvent(structs.NewTaskEvent(structs.TaskFailedValidation).SetValidationError(encodeErr))
		return encodeErr
	}

	// If there's already a task handle (eg from a Restore) there's nothing
	// to do except update state.
	if tr.getDriverHandle() != nil {
		// Ensure running state is persisted but do *not* append a new
		// task event as restoring is a client event and not relevant
		// to a task's lifecycle.
		if err := tr.updateStateImpl(structs.TaskStateRunning); err != nil {
			//TODO return error and destroy task to avoid an orphaned task?
			tr.logger.Warn("error persisting task state", "error", err)
		}
		return nil
	}

	// Start the job if there's no existing handle (or if RecoverTask failed)
	handle, net, err := tr.driver.StartTask(taskConfig)
	if err != nil {
		// The plugin has died, try relaunching it
		if err == bstructs.ErrPluginShutdown {
			tr.logger.Info("failed to start task because plugin shutdown unexpectedly; attempting to recover")
			if err := tr.initDriver(); err != nil {
				taskErr := fmt.Errorf("failed to initialize driver after it exited unexpectedly: %v", err)
				tr.EmitEvent(structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(taskErr))
				return taskErr
			}

			handle, net, err = tr.driver.StartTask(taskConfig)
			if err != nil {
				taskErr := fmt.Errorf("failed to start task after driver exited unexpectedly: %v", err)
				tr.EmitEvent(structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(taskErr))
				return taskErr
			}
		} else {
			// Do *NOT* wrap the error here without maintaining whether or not it is Recoverable.
			// You must emit a task event failure to be considered Recoverable
			tr.EmitEvent(structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(err))
			return err
		}
	}

	tr.stateLock.Lock()
	tr.localState.TaskHandle = handle
	tr.localState.DriverNetwork = net
	if err := tr.stateDB.PutTaskRunnerLocalState(tr.allocID, tr.taskName, tr.localState); err != nil {
		//TODO Nomad will be unable to restore this task; try to kill
		// it now and fail? In general we prefer to leave running
		// tasks running even if the agent encounters an error.
		tr.logger.Warn("error persisting local task state; may be unable to restore after a Nomad restart",
			"error", err, "task_id", handle.Config.ID)
	}
	tr.stateLock.Unlock()

	tr.setDriverHandle(NewDriverHandle(tr.driver, taskConfig.ID, tr.Task(), net))

	// Emit an event that we started
	tr.UpdateState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted))
	return nil
}

// initDriver retrieves the DriverPlugin from the plugin loader for this task
func (tr *TaskRunner) initDriver() error {
	driver, err := tr.driverManager.Dispense(tr.Task().Driver)
	if err != nil {
		return err
	}
	tr.driver = driver

	schema, err := tr.driver.TaskConfigSchema()
	if err != nil {
		return err
	}
	spec, diag := hclspecutils.Convert(schema)
	if diag.HasErrors() {
		return multierror.Append(errors.New("failed to convert task schema"), diag.Errs()...)
	}
	tr.taskSchema = spec

	caps, err := tr.driver.Capabilities()
	if err != nil {
		return err
	}
	tr.driverCapabilities = caps

	return nil
}

// handleKill is used to handle a request to kill a task. It will return
// the handle exit result if one is available and store any error in the task
// runner killErr value.
func (tr *TaskRunner) handleKill() *drivers.ExitResult {
	// Run the pre killing hooks
	tr.preKill()

	// Wait for task ShutdownDelay after running prekill hooks
	// This allows for things like service de-registration to run
	// before waiting to kill task
	if delay := tr.Task().ShutdownDelay; delay != 0 {
		tr.logger.Debug("waiting before killing task", "shutdown_delay", delay)
		time.Sleep(delay)
	}

	// Tell the restart tracker that the task has been killed so it doesn't
	// attempt to restart it.
	tr.restartTracker.SetKilled()

	// Check it is running
	handle := tr.getDriverHandle()
	if handle == nil {
		return nil
	}

	// Kill the task using an exponential backoff in case of failures.
	killErr := tr.killTask(handle)
	if killErr != nil {
		// We couldn't successfully destroy the resource created.
		tr.logger.Error("failed to kill task. Resources may have been leaked", "error", killErr)
		tr.setKillErr(killErr)
	}

	// Block until task has exited.
	waitCh, err := handle.WaitCh(tr.shutdownCtx)

	// The error should be nil or TaskNotFound; if it's something else then a
	// failure in the driver or transport layer occurred
	if err != nil {
		if err == drivers.ErrTaskNotFound {
			return nil
		}
		tr.logger.Error("failed to wait on task. Resources may have been leaked", "error", err)
		tr.setKillErr(killErr)
		return nil
	}

	select {
	case result := <-waitCh:
		return result
	case <-tr.shutdownCtx.Done():
		return nil
	}
}

// killTask kills the task handle. In the case that killing fails,
// killTask will retry with an exponential backoff and will give up at a
// given limit. Returns an error if the task could not be killed.
func (tr *TaskRunner) killTask(handle *DriverHandle) error {
	// Cap the number of times we attempt to kill the task.
	var err error
	for i := 0; i < killFailureLimit; i++ {
		if err = handle.Kill(); err != nil {
			if err == drivers.ErrTaskNotFound {
				tr.logger.Warn("couldn't find task to kill", "task_id", handle.ID())
				return nil
			}
			// Calculate the new backoff
			backoff := (1 << (2 * uint64(i))) * killBackoffBaseline
			if backoff > killBackoffLimit {
				backoff = killBackoffLimit
			}

			tr.logger.Error("failed to kill task", "backoff", backoff, "error", err)
			time.Sleep(backoff)
		} else {
			// Kill was successful
			return nil
		}
	}
	return err
}

// persistLocalState persists local state to disk synchronously.
func (tr *TaskRunner) persistLocalState() error {
	tr.stateLock.RLock()
	defer tr.stateLock.RUnlock()

	return tr.stateDB.PutTaskRunnerLocalState(tr.allocID, tr.taskName, tr.localState)
}

// buildTaskConfig builds a drivers.TaskConfig with a unique ID for the task.
// The ID is unique for every invocation; it is built from the alloc ID, task
// name and 8 random characters.
func (tr *TaskRunner) buildTaskConfig() *drivers.TaskConfig {
	task := tr.Task()
	alloc := tr.Alloc()
	invocationid := uuid.Generate()[:8]
	taskResources := tr.taskResources
	env := tr.envBuilder.Build()
	tr.networkIsolationLock.Lock()
	defer tr.networkIsolationLock.Unlock()

	return &drivers.TaskConfig{
		ID:            fmt.Sprintf("%s/%s/%s", alloc.ID, task.Name, invocationid),
		Name:          task.Name,
		JobName:       alloc.Job.Name,
		TaskGroupName: alloc.TaskGroup,
		Resources: &drivers.Resources{
			NomadResources: taskResources,
			LinuxResources: &drivers.LinuxResources{
				MemoryLimitBytes: taskResources.Memory.MemoryMB * 1024 * 1024,
				CPUShares:        taskResources.Cpu.CpuShares,
				PercentTicks:     float64(taskResources.Cpu.CpuShares) / float64(tr.clientConfig.Node.NodeResources.Cpu.CpuShares),
			},
		},
		Devices:          tr.hookResources.getDevices(),
		Mounts:           tr.hookResources.getMounts(),
		Env:              env.Map(),
		DeviceEnv:        env.DeviceEnv(),
		User:             task.User,
		AllocDir:         tr.taskDir.AllocDir,
		StdoutPath:       tr.logmonHookConfig.stdoutFifo,
		StderrPath:       tr.logmonHookConfig.stderrFifo,
		AllocID:          tr.allocID,
		NetworkIsolation: tr.networkIsolationSpec,
	}
}

// Restore task runner state. Called by AllocRunner.Restore after NewTaskRunner
// but before Run so no locks need to be acquired.
func (tr *TaskRunner) Restore() error {
	ls, ts, err := tr.stateDB.GetTaskRunnerState(tr.allocID, tr.taskName)
	if err != nil {
		return err
	}

	if ls != nil {
		ls.Canonicalize()
		tr.localState = ls
	}

	if ts != nil {
		ts.Canonicalize()
		tr.state = ts
	}

	// If a TaskHandle was persisted, ensure it is valid or destroy it.
	if taskHandle := tr.localState.TaskHandle; taskHandle != nil {
		//TODO if RecoverTask returned the DriverNetwork we wouldn't
		// have to persist it at all!
		restored := tr.restoreHandle(taskHandle, tr.localState.DriverNetwork)

		// If the handle could not be restored, the alloc is
		// non-terminal, and the task isn't a system job: wait until
		// servers have been contacted before running. #1795
		if restored {
			return nil
		}

		alloc := tr.Alloc()
		if tr.state.State == structs.TaskStateDead || alloc.TerminalStatus() || alloc.Job.Type == structs.JobTypeSystem {
			return nil
		}

		tr.logger.Trace("failed to reattach to task; will not run until server is contacted")
		tr.waitOnServers = true

		ev := structs.NewTaskEvent(structs.TaskRestoreFailed).
			SetDisplayMessage("failed to restore task; will not run until server is contacted")
		tr.UpdateState(structs.TaskStatePending, ev)
	}

	return nil
}

// restoreHandle ensures a TaskHandle is valid by calling Driver.RecoverTask
// and sets the driver handle. If the TaskHandle is not valid, DestroyTask is
// called.
func (tr *TaskRunner) restoreHandle(taskHandle *drivers.TaskHandle, net *drivers.DriverNetwork) (success bool) {
	// Ensure handle is well-formed
	if taskHandle.Config == nil {
		return true
	}

	if err := tr.driver.RecoverTask(taskHandle); err != nil {
		if tr.TaskState().State != structs.TaskStateRunning {
			// RecoverTask should fail if the Task wasn't running
			return true
		}

		tr.logger.Error("error recovering task; cleaning up",
			"error", err, "task_id", taskHandle.Config.ID)

		// Try to cleanup any existing task state in the plugin before restarting
		if err := tr.driver.DestroyTask(taskHandle.Config.ID, true); err != nil {
			// Ignore ErrTaskNotFound errors as ideally
			// this task has already been stopped and
			// therefore doesn't exist.
			if err != drivers.ErrTaskNotFound {
				tr.logger.Warn("error destroying unrecoverable task",
					"error", err, "task_id", taskHandle.Config.ID)
			}

			return false
		}

		return true
	}

	// Update driver handle on task runner
	tr.setDriverHandle(NewDriverHandle(tr.driver, taskHandle.Config.ID, tr.Task(), net))
	return true
}

// UpdateState sets the task runner's allocation state and triggers a server
// update.
func (tr *TaskRunner) UpdateState(state string, event *structs.TaskEvent) {
	tr.stateLock.Lock()
	defer tr.stateLock.Unlock()

	if event != nil {
		tr.logger.Trace("setting task state", "state", state, "event", event.Type)

		// Append the event
		tr.appendEvent(event)
	}

	// Update the state
	if err := tr.updateStateImpl(state); err != nil {
		// Only log the error as persistence errors should not
		// affect task state.
		tr.logger.Error("error persisting task state", "error", err, "event", event, "state", state)
	}

	// Notify the alloc runner of the transition
	tr.stateUpdater.TaskStateUpdated()
}

// updateStateImpl updates the in-memory task state and persists to disk.
func (tr *TaskRunner) updateStateImpl(state string) error {

	// Update the task state
	oldState := tr.state.State
	taskState := tr.state
	taskState.State = state

	// Handle the state transition.
	switch state {
	case structs.TaskStateRunning:
		// Capture the start time if it is just starting
		if oldState != structs.TaskStateRunning {
			taskState.StartedAt = time.Now().UTC()
			if !tr.clientConfig.DisableTaggedMetrics {
				metrics.IncrCounterWithLabels([]string{"client", "allocs", "running"}, 1, tr.baseLabels)
			}
			//if r.config.BackwardsCompatibleMetrics {
			//metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "running"}, 1)
			//}
		}
	case structs.TaskStateDead:
		// Capture the finished time if not already set
		if taskState.FinishedAt.IsZero() {
			taskState.FinishedAt = time.Now().UTC()
		}

		// Emit metrics to indicate task completion and failures
		if taskState.Failed {
			if !tr.clientConfig.DisableTaggedMetrics {
				metrics.IncrCounterWithLabels([]string{"client", "allocs", "failed"}, 1, tr.baseLabels)
			}
			//if r.config.BackwardsCompatibleMetrics {
			//metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "failed"}, 1)
			//}
		} else {
			if !tr.clientConfig.DisableTaggedMetrics {
				metrics.IncrCounterWithLabels([]string{"client", "allocs", "complete"}, 1, tr.baseLabels)
			}
			//if r.config.BackwardsCompatibleMetrics {
			//metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "complete"}, 1)
			//}
		}
	}

	// Persist the state and event
	return tr.stateDB.PutTaskState(tr.allocID, tr.taskName, taskState)
}

// EmitEvent appends a new TaskEvent to this task's TaskState. The actual
// TaskState.State (pending, running, dead) is not changed. Use UpdateState to
// transition states.
// Events are persisted locally and sent to the server, but errors are simply
// logged. Use AppendEvent to simply add a new event.
func (tr *TaskRunner) EmitEvent(event *structs.TaskEvent) {
	tr.stateLock.Lock()
	defer tr.stateLock.Unlock()

	tr.appendEvent(event)

	if err := tr.stateDB.PutTaskState(tr.allocID, tr.taskName, tr.state); err != nil {
		// Only a warning because the next event/state-transition will
		// try to persist it again.
		tr.logger.Warn("error persisting event", "error", err, "event", event)
	}

	// Notify the alloc runner of the event
	tr.stateUpdater.TaskStateUpdated()
}

// AppendEvent appends a new TaskEvent to this task's TaskState. The actual
// TaskState.State (pending, running, dead) is not changed. Use UpdateState to
// transition states.
// Events are persisted locally and errors are simply logged. Use EmitEvent to
// also update AllocRunner.
func (tr *TaskRunner) AppendEvent(event *structs.TaskEvent) {
	tr.stateLock.Lock()
	defer tr.stateLock.Unlock()

	tr.appendEvent(event)

	if err := tr.stateDB.PutTaskState(tr.allocID, tr.taskName, tr.state); err != nil {
		// Only a warning because the next event/state-transition will
		// try to persist it again.
		tr.logger.Warn("error persisting event", "error", err, "event", event)
	}
}

// appendEvent to task's event slice. Caller must acquire stateLock.
func (tr *TaskRunner) appendEvent(event *structs.TaskEvent) error {
	// Ensure the event is populated with human readable strings
	event.PopulateEventDisplayMessage()

	// Propagate failure from event to task state
	if event.FailsTask {
		tr.state.Failed = true
	}

	// XXX This seems like a super awkward spot for this? Why not shouldRestart?
	// Update restart metrics
	if event.Type == structs.TaskRestarting {
		if !tr.clientConfig.DisableTaggedMetrics {
			metrics.IncrCounterWithLabels([]string{"client", "allocs", "restart"}, 1, tr.baseLabels)
		}
		//if r.config.BackwardsCompatibleMetrics {
		//metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "restart"}, 1)
		//}
		tr.state.Restarts++
		tr.state.LastRestart = time.Unix(0, event.Time)
	}

	// Append event to slice
	appendTaskEvent(tr.state, event, tr.maxEvents)

	return nil
}

// WaitCh is closed when TaskRunner.Run exits.
func (tr *TaskRunner) WaitCh() <-chan struct{} {
	return tr.waitCh
}

// Update the running allocation with a new version received from the server.
// Calls Update hooks asynchronously with Run.
//
// This method is safe for calling concurrently with Run and does not modify
// the passed in allocation.
func (tr *TaskRunner) Update(update *structs.Allocation) {
	task := update.LookupTask(tr.taskName)
	if task == nil {
		// This should not happen and likely indicates a bug in the
		// server or client.
		tr.logger.Error("allocation update is missing task; killing",
			"group", update.TaskGroup)
		te := structs.NewTaskEvent(structs.TaskKilled).
			SetKillReason("update missing task").
			SetFailsTask()
		tr.Kill(context.Background(), te)
		return
	}

	// Update tr.alloc
	tr.setAlloc(update, task)

	// Trigger update hooks if not terminal
	if !update.TerminalStatus() {
		tr.triggerUpdateHooks()
	}
}

// SetNetworkIsolation is called by the PreRun allocation hook after configuring
// the network isolation for the allocation
func (tr *TaskRunner) SetNetworkIsolation(n *drivers.NetworkIsolationSpec) {
	tr.networkIsolationLock.Lock()
	tr.networkIsolationSpec = n
	tr.networkIsolationLock.Unlock()
}

// triggerUpdateHooks triggers an update if there isn't already one pending.
// Should be called instead of calling updateHooks directly to serialize runs
// of update hooks. TaskRunner state should be updated prior to triggering
// update hooks.
//
// Does not block.
func (tr *TaskRunner) triggerUpdateHooks() {
	select {
	case tr.triggerUpdateCh <- struct{}{}:
	default:
		// already an update hook pending
	}
}

// Shutdown TaskRunner gracefully without affecting the state of the task.
// Shutdown blocks until the main Run loop exits.
func (tr *TaskRunner) Shutdown() {
	tr.logger.Trace("shutting down")
	tr.shutdownCtxCancel()

	<-tr.WaitCh()

	// Run shutdown hooks to cleanup
	tr.shutdownHooks()

	// Persist once more
	tr.persistLocalState()
}

// LatestResourceUsage returns the last resource utilization datapoint
// collected. May return nil if the task is not running or no resource
// utilization has been collected yet.
func (tr *TaskRunner) LatestResourceUsage() *cstructs.TaskResourceUsage {
	tr.resourceUsageLock.Lock()
	ru := tr.resourceUsage
	tr.resourceUsageLock.Unlock()

	// Look up device statistics lazily when fetched, as we currently do not
	// emit any stats for them.
	if ru != nil && tr.deviceStatsReporter != nil {
		deviceResources := tr.taskResources.Devices
		ru.ResourceUsage.DeviceStats = tr.deviceStatsReporter.LatestDeviceResourceStats(deviceResources)
	}
	return ru
}

// UpdateStats updates and emits the latest stats from the driver.
func (tr *TaskRunner) UpdateStats(ru *cstructs.TaskResourceUsage) {
	tr.resourceUsageLock.Lock()
	tr.resourceUsage = ru
	tr.resourceUsageLock.Unlock()
	if ru != nil {
		tr.emitStats(ru)
	}
}

//TODO Remove Backwardscompat or use tr.Alloc()?
func (tr *TaskRunner) setGaugeForMemory(ru *cstructs.TaskResourceUsage) {
	alloc := tr.Alloc()
	var allocatedMem float32
	if taskRes := alloc.AllocatedResources.Tasks[tr.taskName]; taskRes != nil {
		// Convert to bytes to match other memory metrics
		allocatedMem = float32(taskRes.Memory.MemoryMB) * 1024 * 1024
	}

	if !tr.clientConfig.DisableTaggedMetrics {
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "rss"},
			float32(ru.ResourceUsage.MemoryStats.RSS), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "cache"},
			float32(ru.ResourceUsage.MemoryStats.Cache), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "swap"},
			float32(ru.ResourceUsage.MemoryStats.Swap), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "usage"},
			float32(ru.ResourceUsage.MemoryStats.Usage), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "max_usage"},
			float32(ru.ResourceUsage.MemoryStats.MaxUsage), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "kernel_usage"},
			float32(ru.ResourceUsage.MemoryStats.KernelUsage), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "kernel_max_usage"},
			float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage), tr.baseLabels)
		if allocatedMem > 0 {
			metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "allocated"},
				allocatedMem, tr.baseLabels)
		}
	}

	if tr.clientConfig.BackwardsCompatibleMetrics {
		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "rss"}, float32(ru.ResourceUsage.MemoryStats.RSS))
		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "cache"}, float32(ru.ResourceUsage.MemoryStats.Cache))
		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "swap"}, float32(ru.ResourceUsage.MemoryStats.Swap))
		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "usage"}, float32(ru.ResourceUsage.MemoryStats.Usage))
		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "max_usage"}, float32(ru.ResourceUsage.MemoryStats.MaxUsage))
		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "kernel_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelUsage))
		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "kernel_max_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage))
		if allocatedMem > 0 {
			metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "allocated"}, allocatedMem)
		}
	}
}

//TODO Remove Backwardscompat or use tr.Alloc()?
func (tr *TaskRunner) setGaugeForCPU(ru *cstructs.TaskResourceUsage) {
	alloc := tr.Alloc()
	var allocatedCPU float32
	if taskRes := alloc.AllocatedResources.Tasks[tr.taskName]; taskRes != nil {
		allocatedCPU = float32(taskRes.Cpu.CpuShares)
	}

	if !tr.clientConfig.DisableTaggedMetrics {
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "total_percent"},
			float32(ru.ResourceUsage.CpuStats.Percent), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "system"},
			float32(ru.ResourceUsage.CpuStats.SystemMode), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "user"},
			float32(ru.ResourceUsage.CpuStats.UserMode), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "throttled_time"},
			float32(ru.ResourceUsage.CpuStats.ThrottledTime), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "throttled_periods"},
			float32(ru.ResourceUsage.CpuStats.ThrottledPeriods), tr.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "total_ticks"},
			float32(ru.ResourceUsage.CpuStats.TotalTicks), tr.baseLabels)
		if allocatedCPU > 0 {
			metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "allocated"},
				allocatedCPU, tr.baseLabels)
		}
	}

	if tr.clientConfig.BackwardsCompatibleMetrics {
		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "total_percent"}, float32(ru.ResourceUsage.CpuStats.Percent))
		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "system"}, float32(ru.ResourceUsage.CpuStats.SystemMode))
		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "user"}, float32(ru.ResourceUsage.CpuStats.UserMode))
		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "throttled_time"}, float32(ru.ResourceUsage.CpuStats.ThrottledTime))
		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "throttled_periods"}, float32(ru.ResourceUsage.CpuStats.ThrottledPeriods))
		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "total_ticks"}, float32(ru.ResourceUsage.CpuStats.TotalTicks))
		if allocatedCPU > 0 {
			metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "allocated"}, allocatedCPU)
		}
	}
}

// emitStats emits resource usage stats of tasks to remote metrics collector
// sinks
func (tr *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) {
	if !tr.clientConfig.PublishAllocationMetrics {
		return
	}

	if ru.ResourceUsage.MemoryStats != nil {
		tr.setGaugeForMemory(ru)
	} else {
		tr.logger.Debug("Skipping memory stats for allocation", "reason", "MemoryStats is nil")
	}

	if ru.ResourceUsage.CpuStats != nil {
		tr.setGaugeForCPU(ru)
	} else {
		tr.logger.Debug("Skipping cpu stats for allocation", "reason", "CpuStats is nil")
	}
}

// appendTaskEvent updates the task status by appending the new event.
func appendTaskEvent(state *structs.TaskState, event *structs.TaskEvent, capacity int) {
	if state.Events == nil {
		state.Events = make([]*structs.TaskEvent, 1, capacity)
		state.Events[0] = event
		return
	}

	// If we hit capacity, then shift it.
	if len(state.Events) == capacity {
		old := state.Events
		state.Events = make([]*structs.TaskEvent, 0, capacity)
		state.Events = append(state.Events, old[1:]...)
	}

	state.Events = append(state.Events, event)
}

func (tr *TaskRunner) TaskExecHandler() drivermanager.TaskExecHandler {
	// Check it is running
	handle := tr.getDriverHandle()
	if handle == nil {
		return nil
	}
	return handle.ExecStreaming
}

func (tr *TaskRunner) DriverCapabilities() (*drivers.Capabilities, error) {
	return tr.driver.Capabilities()
}

func (tr *TaskRunner) SetAllocHookResources(res *cstructs.AllocHookResources) {
	tr.allocHookResources = res
}
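
// Usage sketch (illustrative only): how a caller such as the alloc runner is
// expected to drive a TaskRunner using the methods defined in this file. The
// cfg value is an assumed, fully populated *Config; error handling is
// abbreviated.
//
//	tr, err := NewTaskRunner(cfg)
//	if err != nil {
//		return err
//	}
//	go tr.Run() // Run closes WaitCh when its loop exits
//
//	// Stop the task itself, recording why it was killed:
//	tr.Kill(context.Background(), structs.NewTaskEvent(structs.TaskKilled))
//	<-tr.WaitCh()
//
//	// Or, on agent shutdown, stop the runner without affecting task state:
//	tr.Shutdown()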