github.com/manicqin/nomad@v0.9.5/client/allocrunner/taskrunner/task_runner.go

     1  package taskrunner
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"strings"
     8  	"sync"
     9  	"time"
    10  
    11  	metrics "github.com/armon/go-metrics"
    12  	log "github.com/hashicorp/go-hclog"
    13  	multierror "github.com/hashicorp/go-multierror"
    14  	"github.com/hashicorp/hcl2/hcldec"
    15  	"github.com/hashicorp/nomad/client/allocdir"
    16  	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
    17  	"github.com/hashicorp/nomad/client/allocrunner/taskrunner/restarts"
    18  	"github.com/hashicorp/nomad/client/allocrunner/taskrunner/state"
    19  	"github.com/hashicorp/nomad/client/config"
    20  	"github.com/hashicorp/nomad/client/consul"
    21  	"github.com/hashicorp/nomad/client/devicemanager"
    22  	cinterfaces "github.com/hashicorp/nomad/client/interfaces"
    23  	"github.com/hashicorp/nomad/client/pluginmanager/drivermanager"
    24  	cstate "github.com/hashicorp/nomad/client/state"
    25  	cstructs "github.com/hashicorp/nomad/client/structs"
    26  	"github.com/hashicorp/nomad/client/taskenv"
    27  	"github.com/hashicorp/nomad/client/vaultclient"
    28  	"github.com/hashicorp/nomad/helper/pluginutils/hclspecutils"
    29  	"github.com/hashicorp/nomad/helper/pluginutils/hclutils"
    30  	"github.com/hashicorp/nomad/helper/uuid"
    31  	"github.com/hashicorp/nomad/nomad/structs"
    32  	bstructs "github.com/hashicorp/nomad/plugins/base/structs"
    33  	"github.com/hashicorp/nomad/plugins/drivers"
    34  )
    35  
    36  const (
    37  	// defaultMaxEvents is the default max capacity for task events on the
    38  	// task state. Overridable for testing.
    39  	defaultMaxEvents = 10
    40  
    41  	// killBackoffBaseline is the baseline time for exponential backoff while
    42  	// killing a task.
    43  	killBackoffBaseline = 5 * time.Second
    44  
    45  	// killBackoffLimit is the limit of the exponential backoff for killing
    46  	// the task.
    47  	killBackoffLimit = 2 * time.Minute
    48  
    49  	// killFailureLimit is how many times we will attempt to kill a task before
    50  	// giving up and potentially leaking resources.
    51  	killFailureLimit = 5
    52  
    53  	// triggerUpdateChCap is the capacity for the triggerUpdateCh used for
    54  	// triggering updates. It should be exactly 1 as even if multiple
    55  	// updates have come in since the last one was handled, we only need to
    56  	// handle the last one.
    57  	triggerUpdateChCap = 1
    58  )
    59  
    60  type TaskRunner struct {
    61  	// allocID, taskName, taskLeader, and taskResources are immutable so these fields may
    62  	// be accessed without locks
    63  	allocID       string
    64  	taskName      string
    65  	taskLeader    bool
    66  	taskResources *structs.AllocatedTaskResources
    67  
    68  	alloc     *structs.Allocation
    69  	allocLock sync.Mutex
    70  
    71  	clientConfig *config.Config
    72  
    73  	// stateUpdater is used to emit updated task state
    74  	stateUpdater interfaces.TaskStateHandler
    75  
    76  	// state captures the state of the task for updating the allocation
    77  	// Must acquire stateLock to access.
    78  	state *structs.TaskState
    79  
    80  	// localState captures the node-local state of the task for when the
    81  	// Nomad agent restarts.
    82  	// Must acquire stateLock to access.
    83  	localState *state.LocalState
    84  
    85  	// stateLock must be acquired when accessing state or localState.
    86  	stateLock sync.RWMutex
    87  
    88  	// stateDB is for persisting localState and taskState
    89  	stateDB cstate.StateDB
    90  
    91  	// shutdownCtx is used to exit the TaskRunner *without* affecting task state.
    92  	shutdownCtx context.Context
    93  
    94  	// shutdownCtxCancel causes the TaskRunner to exit immediately without
    95  	// affecting task state. Useful for testing or graceful agent shutdown.
    96  	shutdownCtxCancel context.CancelFunc
    97  
    98  	// killCtx is the task runner's context representing the task's lifecycle.
    99  	// The context is canceled when the task is killed.
   100  	killCtx context.Context
   101  
   102  	// killCtxCancel is called when killing a task.
   103  	killCtxCancel context.CancelFunc
   104  
   105  	// killErr is populated when killing a task. Access should be done via the
   106  	// getter/setter.
   107  	killErr     error
   108  	killErrLock sync.Mutex
   109  
   110  	// Logger is the logger for the task runner.
   111  	logger log.Logger
   112  
   113  	// triggerUpdateCh is ticked whenever update hooks need to be run and
   114  	// must be created with cap=1 to signal a pending update and prevent
   115  	// callers from deadlocking if the receiver has exited.
   116  	triggerUpdateCh chan struct{}
   117  
   118  	// waitCh is closed when the task runner has transitioned to a terminal
   119  	// state
   120  	waitCh chan struct{}
   121  
   122  	// driver is the driver for the task.
   123  	driver drivers.DriverPlugin
   124  
   125  	// driverCapabilities is the set of capabilities the driver supports
   126  	driverCapabilities *drivers.Capabilities
   127  
   128  	// taskSchema is the hcl spec for the task driver configuration
   129  	taskSchema hcldec.Spec
   130  
   131  	// handleLock guards access to handle and handleResult
   132  	handleLock sync.Mutex
   133  
   134  	// handle to the running driver
   135  	handle *DriverHandle
   136  
   137  	// task is the task being run
   138  	task     *structs.Task
   139  	taskLock sync.RWMutex
   140  
   141  	// taskDir is the directory structure for this task.
   142  	taskDir *allocdir.TaskDir
   143  
   144  	// envBuilder is used to build the task's environment
   145  	envBuilder *taskenv.Builder
   146  
   147  	// restartTracker is used to decide if the task should be restarted.
   148  	restartTracker *restarts.RestartTracker
   149  
   150  	// runnerHooks are task runner lifecycle hooks that should be run on state
   151  	// transitions.
   152  	runnerHooks []interfaces.TaskHook
   153  
   154  	// hookResources captures the resources provided by hooks
   155  	hookResources *hookResources
   156  
   157  	// consulClient is the client used by the consul service hook for
   158  	// registering services and checks
   159  	consulClient consul.ConsulServiceAPI
   160  
   161  	// vaultClient is the client to use to derive and renew Vault tokens
   162  	vaultClient vaultclient.VaultClient
   163  
   164  	// vaultToken is the current Vault token. It should be accessed with the
   165  	// getter.
   166  	vaultToken     string
   167  	vaultTokenLock sync.Mutex
   168  
   169  	// baseLabels are used when emitting tagged metrics. All task runner metrics
   170  	// will have these tags, and optionally more.
   171  	baseLabels []metrics.Label
   172  
   173  	// logmonHookConfig is used to get the paths to the stdout and stderr fifos
   174  	// to be passed to the driver for task logging
   175  	logmonHookConfig *logmonHookConfig
   176  
   177  	// resourceUsage is written via UpdateStats and read via
   178  	// LatestResourceUsage. May be nil at any time.
   179  	resourceUsage     *cstructs.TaskResourceUsage
   180  	resourceUsageLock sync.Mutex
   181  
   182  	// deviceStatsReporter is used to lookup resource usage for alloc devices
   183  	deviceStatsReporter cinterfaces.DeviceStatsReporter
   184  
   185  	// devicemanager is used to mount devices as well as lookup device
   186  	// statistics
   187  	devicemanager devicemanager.Manager
   188  
   189  	// driverManager is used to dispense driver plugins and register event
   190  	// handlers
   191  	driverManager drivermanager.Manager
   192  
   193  	// maxEvents is the capacity of the TaskEvents on the TaskState.
   194  	// Defaults to defaultMaxEvents but overridable for testing.
   195  	maxEvents int
   196  
   197  	// serversContactedCh is passed to TaskRunners so they can detect when
   198  	// GetClientAllocs has been called in case of a failed restore.
   199  	serversContactedCh <-chan struct{}
   200  
   201  	// waitOnServers defaults to false but will be set true if a restore
   202  	// fails and the Run method should wait until serversContactedCh is
   203  	// closed.
   204  	waitOnServers bool
   205  
   206  	networkIsolationLock sync.Mutex
   207  	networkIsolationSpec *drivers.NetworkIsolationSpec
   208  }
   209  
   210  type Config struct {
   211  	Alloc        *structs.Allocation
   212  	ClientConfig *config.Config
   213  	Consul       consul.ConsulServiceAPI
   214  	Task         *structs.Task
   215  	TaskDir      *allocdir.TaskDir
   216  	Logger       log.Logger
   217  
   218  	// Vault is the client to use to derive and renew Vault tokens
   219  	Vault vaultclient.VaultClient
   220  
   221  	// StateDB is used to store and restore state.
   222  	StateDB cstate.StateDB
   223  
   224  	// StateUpdater is used to emit updated task state
   225  	StateUpdater interfaces.TaskStateHandler
   226  
   227  	// deviceStatsReporter is used to lookup resource usage for alloc devices
   228  	DeviceStatsReporter cinterfaces.DeviceStatsReporter
   229  
   230  	// DeviceManager is used to mount devices as well as lookup device
   231  	// statistics
   232  	DeviceManager devicemanager.Manager
   233  
   234  	// DriverManager is used to dispense driver plugins and register event
   235  	// handlers
   236  	DriverManager drivermanager.Manager
   237  
   238  	// ServersContactedCh is closed when the first GetClientAllocs call to
   239  	// servers succeeds and allocs are synced.
   240  	ServersContactedCh chan struct{}
   241  }
   242  
   243  func NewTaskRunner(config *Config) (*TaskRunner, error) {
   244  	// Create a context for causing the runner to exit
   245  	trCtx, trCancel := context.WithCancel(context.Background())
   246  
   247  	// Create a context for killing the runner
   248  	killCtx, killCancel := context.WithCancel(context.Background())
   249  
   250  	// Initialize the environment builder
   251  	envBuilder := taskenv.NewBuilder(
   252  		config.ClientConfig.Node,
   253  		config.Alloc,
   254  		config.Task,
   255  		config.ClientConfig.Region,
   256  	)
   257  
   258  	// Initialize state from alloc if it is set
   259  	tstate := structs.NewTaskState()
   260  	if ts := config.Alloc.TaskStates[config.Task.Name]; ts != nil {
   261  		tstate = ts.Copy()
   262  	}
   263  
   264  	tr := &TaskRunner{
   265  		alloc:               config.Alloc,
   266  		allocID:             config.Alloc.ID,
   267  		clientConfig:        config.ClientConfig,
   268  		task:                config.Task,
   269  		taskDir:             config.TaskDir,
   270  		taskName:            config.Task.Name,
   271  		taskLeader:          config.Task.Leader,
   272  		envBuilder:          envBuilder,
   273  		consulClient:        config.Consul,
   274  		vaultClient:         config.Vault,
   275  		state:               tstate,
   276  		localState:          state.NewLocalState(),
   277  		stateDB:             config.StateDB,
   278  		stateUpdater:        config.StateUpdater,
   279  		deviceStatsReporter: config.DeviceStatsReporter,
   280  		killCtx:             killCtx,
   281  		killCtxCancel:       killCancel,
   282  		shutdownCtx:         trCtx,
   283  		shutdownCtxCancel:   trCancel,
   284  		triggerUpdateCh:     make(chan struct{}, triggerUpdateChCap),
   285  		waitCh:              make(chan struct{}),
   286  		devicemanager:       config.DeviceManager,
   287  		driverManager:       config.DriverManager,
   288  		maxEvents:           defaultMaxEvents,
   289  		serversContactedCh:  config.ServersContactedCh,
   290  	}
   291  
   292  	// Create the logger based on the allocation ID
   293  	tr.logger = config.Logger.Named("task_runner").With("task", config.Task.Name)
   294  
   295  	// Pull out the task's resources
   296  	ares := tr.alloc.AllocatedResources
   297  	if ares != nil {
   298  		tres, ok := ares.Tasks[tr.taskName]
   299  		if !ok {
   300  			return nil, fmt.Errorf("no task resources found on allocation")
   301  		}
   302  		tr.taskResources = tres
   303  	} else {
   304  		// COMPAT(0.11): Upgrade from 0.8 resources to 0.9+ resources
   305  		// Grab the old task resources
   306  		oldTr, ok := tr.alloc.TaskResources[tr.taskName]
   307  		if !ok {
   308  			return nil, fmt.Errorf("no task resources found on allocation")
   309  		}
   310  
   311  		// Convert the old to new
   312  		tr.taskResources = &structs.AllocatedTaskResources{
   313  			Cpu: structs.AllocatedCpuResources{
   314  				CpuShares: int64(oldTr.CPU),
   315  			},
   316  			Memory: structs.AllocatedMemoryResources{
   317  				MemoryMB: int64(oldTr.MemoryMB),
   318  			},
   319  			Networks: oldTr.Networks,
   320  		}
   321  	}
   322  
   323  	// Build the restart tracker.
   324  	tg := tr.alloc.Job.LookupTaskGroup(tr.alloc.TaskGroup)
   325  	if tg == nil {
   326  		tr.logger.Error("alloc missing task group")
   327  		return nil, fmt.Errorf("alloc missing task group")
   328  	}
   329  	tr.restartTracker = restarts.NewRestartTracker(tg.RestartPolicy, tr.alloc.Job.Type)
   330  
   331  	// Get the driver
   332  	if err := tr.initDriver(); err != nil {
   333  		tr.logger.Error("failed to create driver", "error", err)
   334  		return nil, err
   335  	}
   336  
   337  	// Initialize the runner's hooks.
   338  	tr.initHooks()
   339  
   340  	// Initialize base labels
   341  	tr.initLabels()
   342  
   343  	// Initialize initial task received event
   344  	tr.appendEvent(structs.NewTaskEvent(structs.TaskReceived))
   345  
   346  	return tr, nil
   347  }
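
        // Illustrative usage (a minimal sketch, not copied from this repository's
        // callers; the lowercase values are placeholders the alloc runner would
        // already hold): build a Config, construct the runner, and drive it from a
        // goroutine.
        //
        //	tr, err := NewTaskRunner(&Config{
        //		Alloc:         alloc,        // *structs.Allocation from the server
        //		ClientConfig:  clientConfig, // this agent's *config.Config
        //		Task:          task,         // *structs.Task within the alloc
        //		TaskDir:       taskDir,
        //		Logger:        logger,
        //		StateDB:       stateDB,
        //		StateUpdater:  stateUpdater,
        //		Consul:        consulClient,
        //		Vault:         vaultClient,
        //		DeviceManager: deviceManager,
        //		DriverManager: driverManager,
        //	})
        //	if err != nil {
        //		return err
        //	}
        //	go tr.Run()   // Run closes tr.WaitCh() once the task reaches a terminal state
        //	<-tr.WaitCh()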
   348  
   349  func (tr *TaskRunner) initLabels() {
   350  	alloc := tr.Alloc()
   351  	tr.baseLabels = []metrics.Label{
   352  		{
   353  			Name:  "job_name",
   354  			Value: alloc.Job.Name,
   355  		},
   356  		{
   357  			Name:  "task_group",
   358  			Value: alloc.TaskGroup,
   359  		},
   360  		{
   361  			Name:  "alloc_id",
   362  			Value: tr.allocID,
   363  		},
   364  		{
   365  			Name:  "task",
   366  			Value: tr.taskName,
   367  		},
   368  		{
   369  			Name:  "namespace",
   370  			Value: tr.alloc.Namespace,
   371  		},
   372  	}
   373  
   374  	if tr.alloc.Job.ParentID != "" {
   375  		tr.baseLabels = append(tr.baseLabels, metrics.Label{
   376  			Name:  "parent_id",
   377  			Value: tr.alloc.Job.ParentID,
   378  		})
   379  		if strings.Contains(tr.alloc.Job.Name, "/dispatch-") {
   380  			tr.baseLabels = append(tr.baseLabels, metrics.Label{
   381  				Name:  "dispatch_id",
   382  				Value: strings.Split(tr.alloc.Job.Name, "/dispatch-")[1],
   383  			})
   384  		}
   385  		if strings.Contains(tr.alloc.Job.Name, "/periodic-") {
   386  			tr.baseLabels = append(tr.baseLabels, metrics.Label{
   387  				Name:  "periodic_id",
   388  				Value: strings.Split(tr.alloc.Job.Name, "/periodic-")[1],
   389  			})
   390  		}
   391  	}
   392  }
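
        // For example, a dispatched child job is named "<parent job>/dispatch-<id>",
        // so the split above recovers the dispatch identifier (the concrete name
        // below is illustrative only):
        //
        //	strings.Split("batch/dispatch-1234567890", "/dispatch-")[1] // "1234567890" -> dispatch_id label
        //
        // while the parent_id label simply carries Job.ParentID.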
   393  
   394  // MarkFailedDead marks a task as failed and not to run. It is intended to be
   395  // invoked when alloc runner prestart hooks fail.
   396  // Should never be called together with Run().
   397  func (tr *TaskRunner) MarkFailedDead(reason string) {
   398  	defer close(tr.waitCh)
   399  
   400  	tr.stateLock.Lock()
   401  	if err := tr.stateDB.PutTaskRunnerLocalState(tr.allocID, tr.taskName, tr.localState); err != nil {
   402  		//TODO Nomad will be unable to restore this task; try to kill
   403  		//     it now and fail? In general we prefer to leave running
   404  		//     tasks running even if the agent encounters an error.
   405  		tr.logger.Warn("error persisting local failed task state; may be unable to restore after a Nomad restart",
   406  			"error", err)
   407  	}
   408  	tr.stateLock.Unlock()
   409  
   410  	event := structs.NewTaskEvent(structs.TaskSetupFailure).
   411  		SetDisplayMessage(reason).
   412  		SetFailsTask()
   413  	tr.UpdateState(structs.TaskStateDead, event)
   414  
   415  	// Run the stop hooks in case task was a restored task that failed prestart
   416  	if err := tr.stop(); err != nil {
   417  		tr.logger.Error("stop failed while marking task dead", "error", err)
   418  	}
   419  }
   420  
   421  // Run the TaskRunner. Starts the user's task or reattaches to a restored task.
   422  // Run closes WaitCh when it exits. Should be started in a goroutine.
   423  func (tr *TaskRunner) Run() {
   424  	defer close(tr.waitCh)
   425  	var result *drivers.ExitResult
   426  
   427  	tr.stateLock.RLock()
   428  	dead := tr.state.State == structs.TaskStateDead
   429  	tr.stateLock.RUnlock()
   430  
   431  	// if restoring a dead task, ensure that task is cleared and all post hooks
   432  	// are called without additional state updates
   433  	if dead {
   434  		// Run cleanup functions without emitting any additional events to handle
   435  		// the case where we restored a dead task because the client terminated
   436  		// after the task finished but before post-run actions completed.
   437  		tr.clearDriverHandle()
   438  		tr.stateUpdater.TaskStateUpdated()
   439  		if err := tr.stop(); err != nil {
   440  			tr.logger.Error("stop failed on terminal task", "error", err)
   441  		}
   442  		return
   443  	}
   444  
   445  	// Updates are handled asynchronously with the other hooks but each
   446  	// triggered update - whether due to alloc updates or a new vault token
   447  	// - should be handled serially.
   448  	go tr.handleUpdates()
   449  
   450  	// If restore failed wait until servers are contacted before running.
   451  	// #1795
   452  	if tr.waitOnServers {
   453  		tr.logger.Info("task failed to restore; waiting to contact server before restarting")
   454  		select {
   455  		case <-tr.killCtx.Done():
   456  		case <-tr.shutdownCtx.Done():
   457  			return
   458  		case <-tr.serversContactedCh:
   459  			tr.logger.Info("server contacted; unblocking waiting task")
   460  		}
   461  	}
   462  
   463  MAIN:
   464  	for !tr.Alloc().TerminalStatus() {
   465  		select {
   466  		case <-tr.killCtx.Done():
   467  			break MAIN
   468  		case <-tr.shutdownCtx.Done():
   469  			// TaskRunner was told to exit immediately
   470  			return
   471  		default:
   472  		}
   473  
   474  		// Run the prestart hooks
   475  		if err := tr.prestart(); err != nil {
   476  			tr.logger.Error("prestart failed", "error", err)
   477  			tr.restartTracker.SetStartError(err)
   478  			goto RESTART
   479  		}
   480  
   481  		select {
   482  		case <-tr.killCtx.Done():
   483  			break MAIN
   484  		case <-tr.shutdownCtx.Done():
   485  			// TaskRunner was told to exit immediately
   486  			return
   487  		default:
   488  		}
   489  
   490  		// Run the task
   491  		if err := tr.runDriver(); err != nil {
   492  			tr.logger.Error("running driver failed", "error", err)
   493  			tr.restartTracker.SetStartError(err)
   494  			goto RESTART
   495  		}
   496  
   497  		// Run the poststart hooks
   498  		if err := tr.poststart(); err != nil {
   499  			tr.logger.Error("poststart failed", "error", err)
   500  		}
   501  
   502  		// Grab the result proxy and wait for task to exit
   503  	WAIT:
   504  		{
   505  			handle := tr.getDriverHandle()
   506  			result = nil
   507  
   508  			// Do *not* use tr.killCtx here as it would cause
   509  			// Wait() to unblock before the task exits when Kill()
   510  			// is called.
   511  			if resultCh, err := handle.WaitCh(context.Background()); err != nil {
   512  				tr.logger.Error("wait task failed", "error", err)
   513  				// If a timeout was specified, set a timer and handle its expiry as an additional case
   514  			} else if tr.task.Timeout > 0 {
   515  				timer := time.NewTimer(tr.task.Timeout * time.Second)
   516  				select {
   517  				case <-tr.killCtx.Done():
   518  					// We can go through the normal should restart check since
   519  					// the restart tracker knows it is killed
   520  					result = tr.handleKill()
   521  				case <-tr.shutdownCtx.Done():
   522  					// TaskRunner was told to exit immediately
   523  					return
   524  				case <-timer.C:
   525  					result = tr.handleTimeout()
   526  				case result = <-resultCh:
   527  				}
   528  
   529  				// WaitCh returned a result
   530  				if retryWait := tr.handleTaskExitResult(result); retryWait {
   531  					goto WAIT
   532  				}
   533  			} else {
   534  				select {
   535  				case <-tr.killCtx.Done():
   536  					// We can go through the normal should restart check since
   537  					// the restart tracker knows it is killed
   538  					result = tr.handleKill()
   539  				case <-tr.shutdownCtx.Done():
   540  					// TaskRunner was told to exit immediately
   541  					return
   542  				case result = <-resultCh:
   543  				}
   544  
   545  				// WaitCh returned a result
   546  				if retryWait := tr.handleTaskExitResult(result); retryWait {
   547  					goto WAIT
   548  				}
   549  			}
   550  		}
   551  
   552  		// Clear the handle
   553  		tr.clearDriverHandle()
   554  
   555  		// Store the wait result on the restart tracker
   556  		tr.restartTracker.SetExitResult(result)
   557  
   558  		if err := tr.exited(); err != nil {
   559  			tr.logger.Error("exited hooks failed", "error", err)
   560  		}
   561  
   562  	RESTART:
   563  		restart, restartDelay := tr.shouldRestart()
   564  		if !restart {
   565  			break MAIN
   566  		}
   567  
   568  		// Actually restart by sleeping and also watching for destroy events
   569  		select {
   570  		case <-time.After(restartDelay):
   571  		case <-tr.killCtx.Done():
   572  			tr.logger.Trace("task killed between restarts", "delay", restartDelay)
   573  			break MAIN
   574  		case <-tr.shutdownCtx.Done():
   575  			// TaskRunner was told to exit immediately
   576  			tr.logger.Trace("gracefully shutting down during restart delay")
   577  			return
   578  		}
   579  	}
   580  
   581  	// Ensure handle is cleaned up. Restore could have recovered a task
   582  	// that should be terminal, so if the handle still exists we should
   583  	// kill it here.
   584  	if tr.getDriverHandle() != nil {
   585  		if result = tr.handleKill(); result != nil {
   586  			tr.emitExitResultEvent(result)
   587  		}
   588  
   589  		tr.clearDriverHandle()
   590  
   591  		if err := tr.exited(); err != nil {
   592  			tr.logger.Error("exited hooks failed while cleaning up terminal task", "error", err)
   593  		}
   594  	}
   595  
   596  	// Mark the task as dead
   597  	tr.UpdateState(structs.TaskStateDead, nil)
   598  
   599  	// Run the stop hooks
   600  	if err := tr.stop(); err != nil {
   601  		tr.logger.Error("stop failed", "error", err)
   602  	}
   603  
   604  	tr.logger.Debug("task run loop exiting")
   605  }
   606  
   607  // handleTaskExitResult handles the results returned by the task exiting. If
   608  // retryWait is true, the caller should attempt to wait on the task again since
   609  // it has not actually finished running. This can happen if the driver plugin
   610  // has exited.
   611  func (tr *TaskRunner) handleTaskExitResult(result *drivers.ExitResult) (retryWait bool) {
   612  	if result == nil {
   613  		return false
   614  	}
   615  
   616  	if result.Err == bstructs.ErrPluginShutdown {
   617  		dn := tr.Task().Driver
   618  		tr.logger.Debug("driver plugin has shutdown; attempting to recover task", "driver", dn)
   619  
   620  		// Initialize a new driver handle
   621  		if err := tr.initDriver(); err != nil {
   622  			tr.logger.Error("failed to initialize driver after it exited unexpectedly", "error", err, "driver", dn)
   623  			return false
   624  		}
   625  
   626  		// Try to restore the handle
   627  		tr.stateLock.RLock()
   628  		h := tr.localState.TaskHandle
   629  		net := tr.localState.DriverNetwork
   630  		tr.stateLock.RUnlock()
   631  		if !tr.restoreHandle(h, net) {
   632  			tr.logger.Error("failed to restore handle on driver after it exited unexpectedly", "driver", dn)
   633  			return false
   634  		}
   635  
   636  		tr.logger.Debug("task successfully recovered on driver", "driver", dn)
   637  		return true
   638  	}
   639  
   640  	// Emit Terminated event
   641  	tr.emitExitResultEvent(result)
   642  
   643  	return false
   644  }
   645  
   646  // emitExitResultEvent emits a TaskTerminated event for an ExitResult.
   647  func (tr *TaskRunner) emitExitResultEvent(result *drivers.ExitResult) {
   648  	event := structs.NewTaskEvent(structs.TaskTerminated).
   649  		SetExitCode(result.ExitCode).
   650  		SetSignal(result.Signal).
   651  		SetOOMKilled(result.OOMKilled).
   652  		SetExitMessage(result.Err).
   653  		SetTimeout(result.TimedOut)
   654  
   655  	tr.EmitEvent(event)
   656  
   657  	if result.OOMKilled && !tr.clientConfig.DisableTaggedMetrics {
   658  		metrics.IncrCounterWithLabels([]string{"client", "allocs", "oom_killed"}, 1, tr.baseLabels)
   659  	}
   660  }
   661  
   662  // handleUpdates runs update hooks when triggerUpdateCh is ticked and exits
   663  // when Run has returned. Should only be run in a goroutine from Run.
   664  func (tr *TaskRunner) handleUpdates() {
   665  	for {
   666  		select {
   667  		case <-tr.triggerUpdateCh:
   668  		case <-tr.waitCh:
   669  			return
   670  		}
   671  
   672  		// Non-terminal update; run hooks
   673  		tr.updateHooks()
   674  	}
   675  }
   676  
   677  // shouldRestart determines whether the task should be restarted and updates
   678  // the task state unless the task is killed or terminated.
   679  func (tr *TaskRunner) shouldRestart() (bool, time.Duration) {
   680  	// Determine if we should restart
   681  	state, when := tr.restartTracker.GetState()
   682  	reason := tr.restartTracker.GetReason()
   683  	switch state {
   684  	case structs.TaskKilled:
   685  		// Never restart an explicitly killed task. Kill method handles
   686  		// updating the server.
   687  		tr.EmitEvent(structs.NewTaskEvent(state))
   688  		return false, 0
   689  	case structs.TaskNotRestarting, structs.TaskTerminated:
   690  		tr.logger.Info("not restarting task", "reason", reason)
   691  		if state == structs.TaskNotRestarting {
   692  			tr.UpdateState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskNotRestarting).SetRestartReason(reason).SetFailsTask())
   693  		}
   694  		return false, 0
   695  	case structs.TaskRestarting:
   696  		tr.logger.Info("restarting task", "reason", reason, "delay", when)
   697  		tr.UpdateState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskRestarting).SetRestartDelay(when).SetRestartReason(reason))
   698  		return true, when
   699  	default:
   700  		tr.logger.Error("restart tracker returned unknown state", "state", state)
   701  		return true, when
   702  	}
   703  }
   704  
   705  // runDriver starts the task via the driver plugin and emits an appropriate
   706  // task event on success or failure.
   707  func (tr *TaskRunner) runDriver() error {
   708  
   709  	taskConfig := tr.buildTaskConfig()
   710  
   711  	// Build hcl context variables
   712  	vars, errs, err := tr.envBuilder.Build().AllValues()
   713  	if err != nil {
   714  		return fmt.Errorf("error building environment variables: %v", err)
   715  	}
   716  
   717  	// Handle per-key errors
   718  	if len(errs) > 0 {
   719  		keys := make([]string, 0, len(errs))
   720  		for k, err := range errs {
   721  			keys = append(keys, k)
   722  
   723  			if tr.logger.IsTrace() {
   724  				// Verbosely log every diagnostic for debugging
   725  				tr.logger.Trace("error building environment variables", "key", k, "error", err)
   726  			}
   727  		}
   728  
   729  		tr.logger.Warn("some environment variables not available for rendering", "keys", strings.Join(keys, ", "))
   730  	}
   731  
   732  	val, diag, diagErrs := hclutils.ParseHclInterface(tr.task.Config, tr.taskSchema, vars)
   733  	if diag.HasErrors() {
   734  		parseErr := multierror.Append(errors.New("failed to parse config: "), diagErrs...)
   735  		tr.EmitEvent(structs.NewTaskEvent(structs.TaskFailedValidation).SetValidationError(parseErr))
   736  		return parseErr
   737  	}
   738  
   739  	if err := taskConfig.EncodeDriverConfig(val); err != nil {
   740  		encodeErr := fmt.Errorf("failed to encode driver config: %v", err)
   741  		tr.EmitEvent(structs.NewTaskEvent(structs.TaskFailedValidation).SetValidationError(encodeErr))
   742  		return encodeErr
   743  	}
   744  
   745  	// If there's already a task handle (eg from a Restore) there's nothing
   746  	// to do except update state.
   747  	if tr.getDriverHandle() != nil {
   748  		// Ensure running state is persisted but do *not* append a new
   749  		// task event as restoring is a client event and not relevant
   750  		// to a task's lifecycle.
   751  		if err := tr.updateStateImpl(structs.TaskStateRunning); err != nil {
   752  			//TODO return error and destroy task to avoid an orphaned task?
   753  			tr.logger.Warn("error persisting task state", "error", err)
   754  		}
   755  		return nil
   756  	}
   757  
   758  	// Start the task if there's no existing handle (or if RecoverTask failed)
   759  	handle, net, err := tr.driver.StartTask(taskConfig)
   760  	if err != nil {
   761  		// The plugin has died, try relaunching it
   762  		if err == bstructs.ErrPluginShutdown {
   763  			tr.logger.Info("failed to start task because plugin shutdown unexpectedly; attempting to recover")
   764  			if err := tr.initDriver(); err != nil {
   765  				taskErr := fmt.Errorf("failed to initialize driver after it exited unexpectedly: %v", err)
   766  				tr.EmitEvent(structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(taskErr))
   767  				return taskErr
   768  			}
   769  
   770  			handle, net, err = tr.driver.StartTask(taskConfig)
   771  			if err != nil {
   772  				taskErr := fmt.Errorf("failed to start task after driver exited unexpectedly: %v", err)
   773  				tr.EmitEvent(structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(taskErr))
   774  				return taskErr
   775  			}
   776  		} else {
   777  			// Do *NOT* wrap the error here without preserving whether or not it is
   778  			// Recoverable. A task failure event must be emitted for it to be considered Recoverable.
   779  			tr.EmitEvent(structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(err))
   780  			return err
   781  		}
   782  	}
   783  
   784  	tr.stateLock.Lock()
   785  	tr.localState.TaskHandle = handle
   786  	tr.localState.DriverNetwork = net
   787  	if err := tr.stateDB.PutTaskRunnerLocalState(tr.allocID, tr.taskName, tr.localState); err != nil {
   788  		//TODO Nomad will be unable to restore this task; try to kill
   789  		//     it now and fail? In general we prefer to leave running
   790  		//     tasks running even if the agent encounters an error.
   791  		tr.logger.Warn("error persisting local task state; may be unable to restore after a Nomad restart",
   792  			"error", err, "task_id", handle.Config.ID)
   793  	}
   794  	tr.stateLock.Unlock()
   795  
   796  	tr.setDriverHandle(NewDriverHandle(tr.driver, taskConfig.ID, tr.Task(), net))
   797  
   798  	// Emit an event that we started
   799  	tr.UpdateState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted))
   800  	return nil
   801  }
   802  
   803  // initDriver retrieves the DriverPlugin from the plugin loader for this task
   804  func (tr *TaskRunner) initDriver() error {
   805  	driver, err := tr.driverManager.Dispense(tr.Task().Driver)
   806  	if err != nil {
   807  		return err
   808  	}
   809  	tr.driver = driver
   810  
   811  	schema, err := tr.driver.TaskConfigSchema()
   812  	if err != nil {
   813  		return err
   814  	}
   815  	spec, diag := hclspecutils.Convert(schema)
   816  	if diag.HasErrors() {
   817  		return multierror.Append(errors.New("failed to convert task schema"), diag.Errs()...)
   818  	}
   819  	tr.taskSchema = spec
   820  
   821  	caps, err := tr.driver.Capabilities()
   822  	if err != nil {
   823  		return err
   824  	}
   825  	tr.driverCapabilities = caps
   826  
   827  	return nil
   828  }
   829  
   830  func (tr *TaskRunner) handleTimeout() *drivers.ExitResult {
   831  	event := structs.NewTaskEvent(structs.TaskKilling).
   832  		SetKillReason("Timeout")
   833  
   834  	tr.EmitEvent(event)
   835  	result := tr.handleKill()
   836  	if result == nil { result = &drivers.ExitResult{} } // handleKill may return nil; avoid a nil dereference
   837  	result.ExitCode, result.TimedOut = 1, true
   838  
   839  	return result
   840  }
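
        // Sketch of the outcome when a task timeout is configured (tr.task.Timeout > 0
        // in Run above) and the timer fires before the task exits:
        //
        //	result := tr.handleTimeout()
        //	// result.TimedOut == true, result.ExitCode == 1, and a TaskKilling event
        //	// with kill reason "Timeout" has been emitted before the task was killed.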
   841  
   842  // handleKill is used to handle a request to kill a task. It will return
   843  // the handle exit result if one is available and store any error in the task
   844  // runner killErr value.
   845  func (tr *TaskRunner) handleKill() *drivers.ExitResult {
   846  	// Run the pre killing hooks
   847  	tr.preKill()
   848  
   849  	// Tell the restart tracker that the task has been killed so it doesn't
   850  	// attempt to restart it.
   851  	tr.restartTracker.SetKilled()
   852  
   853  	// Check it is running
   854  	handle := tr.getDriverHandle()
   855  	if handle == nil {
   856  		return nil
   857  	}
   858  
   859  	// Kill the task using an exponential backoff in case of failures.
   860  	killErr := tr.killTask(handle)
   861  	if killErr != nil {
   862  		// We couldn't successfully destroy the resource created.
   863  		tr.logger.Error("failed to kill task. Resources may have been leaked", "error", killErr)
   864  		tr.setKillErr(killErr)
   865  	}
   866  
   867  	// Block until task has exited.
   868  	waitCh, err := handle.WaitCh(tr.shutdownCtx)
   869  
   870  	// The error should be nil or TaskNotFound; anything else indicates a
   871  	// failure in the driver or transport layer.
   872  	if err != nil {
   873  		if err == drivers.ErrTaskNotFound {
   874  			return nil
   875  		}
   876  		tr.logger.Error("failed to wait on task. Resources may have been leaked", "error", err)
   877  		tr.setKillErr(err)
   878  		return nil
   879  	}
   880  
   881  	select {
   882  	case result := <-waitCh:
   883  		return result
   884  	case <-tr.shutdownCtx.Done():
   885  		return nil
   886  	}
   887  }
   888  
   889  // killTask kills the task handle. In the case that killing fails,
   890  // killTask will retry with an exponential backoff and will give up at a
   891  // given limit. Returns an error if the task could not be killed.
   892  func (tr *TaskRunner) killTask(handle *DriverHandle) error {
   893  	// Cap the number of times we attempt to kill the task.
   894  	var err error
   895  	for i := 0; i < killFailureLimit; i++ {
   896  		if err = handle.Kill(); err != nil {
   897  			if err == drivers.ErrTaskNotFound {
   898  				tr.logger.Warn("couldn't find task to kill", "task_id", handle.ID())
   899  				return nil
   900  			}
   901  			// Calculate the new backoff
   902  			backoff := (1 << (2 * uint64(i))) * killBackoffBaseline
   903  			if backoff > killBackoffLimit {
   904  				backoff = killBackoffLimit
   905  			}
   906  
   907  			tr.logger.Error("failed to kill task", "backoff", backoff, "error", err)
   908  			time.Sleep(backoff)
   909  		} else {
   910  			// Kill was successful
   911  			return nil
   912  		}
   913  	}
   914  	return err
   915  }
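
        // With the constants defined at the top of this file (killBackoffBaseline = 5s,
        // killBackoffLimit = 2m, killFailureLimit = 5), the delay after failed kill
        // attempt i is (1 << (2*i)) * 5s, capped at 2m, i.e.:
        //
        //	i=0: 5s, i=1: 20s, i=2: 1m20s, i=3: 2m (capped), i=4: 2m (capped)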
   916  
   917  // persistLocalState persists local state to disk synchronously.
   918  func (tr *TaskRunner) persistLocalState() error {
   919  	tr.stateLock.RLock()
   920  	defer tr.stateLock.RUnlock()
   921  
   922  	return tr.stateDB.PutTaskRunnerLocalState(tr.allocID, tr.taskName, tr.localState)
   923  }
   924  
   925  // buildTaskConfig builds a drivers.TaskConfig with a unique ID for the task.
   926  // The ID is unique for every invocation; it is built from the alloc ID, task
   927  // name, and 8 random characters.
   928  func (tr *TaskRunner) buildTaskConfig() *drivers.TaskConfig {
   929  	task := tr.Task()
   930  	alloc := tr.Alloc()
   931  	invocationid := uuid.Generate()[:8]
   932  	taskResources := tr.taskResources
   933  	env := tr.envBuilder.Build()
   934  	tr.networkIsolationLock.Lock()
   935  	defer tr.networkIsolationLock.Unlock()
   936  
   937  	return &drivers.TaskConfig{
   938  		ID:            fmt.Sprintf("%s/%s/%s", alloc.ID, task.Name, invocationid),
   939  		Name:          task.Name,
   940  		JobName:       alloc.Job.Name,
   941  		TaskGroupName: alloc.TaskGroup,
   942  		Resources: &drivers.Resources{
   943  			NomadResources: taskResources,
   944  			LinuxResources: &drivers.LinuxResources{
   945  				MemoryLimitBytes: taskResources.Memory.MemoryMB * 1024 * 1024,
   946  				CPUShares:        taskResources.Cpu.CpuShares,
   947  				PercentTicks:     float64(taskResources.Cpu.CpuShares) / float64(tr.clientConfig.Node.NodeResources.Cpu.CpuShares),
   948  			},
   949  		},
   950  		Devices:          tr.hookResources.getDevices(),
   951  		Mounts:           tr.hookResources.getMounts(),
   952  		Env:              env.Map(),
   953  		DeviceEnv:        env.DeviceEnv(),
   954  		User:             task.User,
   955  		AllocDir:         tr.taskDir.AllocDir,
   956  		StdoutPath:       tr.logmonHookConfig.stdoutFifo,
   957  		StderrPath:       tr.logmonHookConfig.stderrFifo,
   958  		AllocID:          tr.allocID,
   959  		NetworkIsolation: tr.networkIsolationSpec,
   960  	}
   961  }
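
        // The resulting driver task ID has the form "<allocID>/<taskName>/<8 random
        // chars>", for example (values are illustrative only):
        //
        //	"b6d1cd03-5f39-48c5-9b3e-0a4b0b6a1d2e/redis/0f3e9a2b"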
   962  
   963  // Restore task runner state. Called by AllocRunner.Restore after NewTaskRunner
   964  // but before Run so no locks need to be acquired.
   965  func (tr *TaskRunner) Restore() error {
   966  	ls, ts, err := tr.stateDB.GetTaskRunnerState(tr.allocID, tr.taskName)
   967  	if err != nil {
   968  		return err
   969  	}
   970  
   971  	if ls != nil {
   972  		ls.Canonicalize()
   973  		tr.localState = ls
   974  	}
   975  
   976  	if ts != nil {
   977  		ts.Canonicalize()
   978  		tr.state = ts
   979  	}
   980  
   981  	// If a TaskHandle was persisted, ensure it is valid or destroy it.
   982  	if taskHandle := tr.localState.TaskHandle; taskHandle != nil {
   983  		//TODO if RecoverTask returned the DriverNetwork we wouldn't
   984  		//     have to persist it at all!
   985  		restored := tr.restoreHandle(taskHandle, tr.localState.DriverNetwork)
   986  
   987  		// If the handle could not be restored, the alloc is
   988  		// non-terminal, and the task isn't a system job: wait until
   989  		// servers have been contacted before running. #1795
   990  		if restored {
   991  			return nil
   992  		}
   993  
   994  		alloc := tr.Alloc()
   995  		if tr.state.State == structs.TaskStateDead || alloc.TerminalStatus() || alloc.Job.Type == structs.JobTypeSystem {
   996  			return nil
   997  		}
   998  
   999  		tr.logger.Trace("failed to reattach to task; will not run until server is contacted")
  1000  		tr.waitOnServers = true
  1001  
  1002  		ev := structs.NewTaskEvent(structs.TaskRestoreFailed).
  1003  			SetDisplayMessage("failed to restore task; will not run until server is contacted")
  1004  		tr.UpdateState(structs.TaskStatePending, ev)
  1005  	}
  1006  
  1007  	return nil
  1008  }
  1009  
  1010  // restoreHandle ensures a TaskHandle is valid by calling Driver.RecoverTask
  1011  // and sets the driver handle. If the TaskHandle is not valid, DestroyTask is
  1012  // called.
  1013  func (tr *TaskRunner) restoreHandle(taskHandle *drivers.TaskHandle, net *drivers.DriverNetwork) (success bool) {
  1014  	// Ensure handle is well-formed
  1015  	if taskHandle.Config == nil {
  1016  		return true
  1017  	}
  1018  
  1019  	if err := tr.driver.RecoverTask(taskHandle); err != nil {
  1020  		if tr.TaskState().State != structs.TaskStateRunning {
  1021  			// RecoverTask should fail if the Task wasn't running
  1022  			return true
  1023  		}
  1024  
  1025  		tr.logger.Error("error recovering task; cleaning up",
  1026  			"error", err, "task_id", taskHandle.Config.ID)
  1027  
  1028  		// Try to cleanup any existing task state in the plugin before restarting
  1029  		if err := tr.driver.DestroyTask(taskHandle.Config.ID, true); err != nil {
  1030  			// Ignore ErrTaskNotFound errors as ideally
  1031  			// this task has already been stopped and
  1032  			// therefore doesn't exist.
  1033  			if err != drivers.ErrTaskNotFound {
  1034  				tr.logger.Warn("error destroying unrecoverable task",
  1035  					"error", err, "task_id", taskHandle.Config.ID)
  1036  			}
  1037  
  1038  			return false
  1039  		}
  1040  
  1041  		return true
  1042  	}
  1043  
  1044  	// Update driver handle on task runner
  1045  	tr.setDriverHandle(NewDriverHandle(tr.driver, taskHandle.Config.ID, tr.Task(), net))
  1046  	return true
  1047  }
  1048  
  1049  // UpdateState sets the task runner's allocation state and triggers a server
  1050  // update.
  1051  func (tr *TaskRunner) UpdateState(state string, event *structs.TaskEvent) {
  1052  	tr.stateLock.Lock()
  1053  	defer tr.stateLock.Unlock()
  1054  
  1055  	if event != nil {
  1056  		tr.logger.Trace("setting task state", "state", state, "event", event.Type)
  1057  
  1058  		// Append the event
  1059  		tr.appendEvent(event)
  1060  	}
  1061  
  1062  	// Update the state
  1063  	if err := tr.updateStateImpl(state); err != nil {
  1064  		// Only log the error, as persistence errors should not
  1065  		// affect task state.
  1066  		tr.logger.Error("error persisting task state", "error", err, "event", event, "state", state)
  1067  	}
  1068  
  1069  	// Notify the alloc runner of the transition
  1070  	tr.stateUpdater.TaskStateUpdated()
  1071  }
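
        // Typical call sites in this file pass both a state and an event, for example:
        //
        //	tr.UpdateState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted))
        //
        // or pass a nil event (as Run does when marking the task dead) to change only
        // the state.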
  1072  
  1073  // updateStateImpl updates the in-memory task state and persists to disk.
  1074  func (tr *TaskRunner) updateStateImpl(state string) error {
  1075  
  1076  	// Update the task state
  1077  	oldState := tr.state.State
  1078  	taskState := tr.state
  1079  	taskState.State = state
  1080  
  1081  	// Handle the state transition.
  1082  	switch state {
  1083  	case structs.TaskStateRunning:
  1084  		// Capture the start time if it is just starting
  1085  		if oldState != structs.TaskStateRunning {
  1086  			taskState.StartedAt = time.Now().UTC()
  1087  			if !tr.clientConfig.DisableTaggedMetrics {
  1088  				metrics.IncrCounterWithLabels([]string{"client", "allocs", "running"}, 1, tr.baseLabels)
  1089  			}
  1090  			//if r.config.BackwardsCompatibleMetrics {
  1091  			//metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "running"}, 1)
  1092  			//}
  1093  		}
  1094  	case structs.TaskStateDead:
  1095  		// Capture the finished time if not already set
  1096  		if taskState.FinishedAt.IsZero() {
  1097  			taskState.FinishedAt = time.Now().UTC()
  1098  		}
  1099  
  1100  		// Emitting metrics to indicate task complete and failures
  1101  		if taskState.Failed {
  1102  			if !tr.clientConfig.DisableTaggedMetrics {
  1103  				metrics.IncrCounterWithLabels([]string{"client", "allocs", "failed"}, 1, tr.baseLabels)
  1104  			}
  1105  			//if r.config.BackwardsCompatibleMetrics {
  1106  			//metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "failed"}, 1)
  1107  			//}
  1108  		} else {
  1109  			if !tr.clientConfig.DisableTaggedMetrics {
  1110  				metrics.IncrCounterWithLabels([]string{"client", "allocs", "complete"}, 1, tr.baseLabels)
  1111  			}
  1112  			//if r.config.BackwardsCompatibleMetrics {
  1113  			//metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "complete"}, 1)
  1114  			//}
  1115  		}
  1116  	}
  1117  
  1118  	// Persist the state and event
  1119  	return tr.stateDB.PutTaskState(tr.allocID, tr.taskName, taskState)
  1120  }
  1121  
  1122  // EmitEvent appends a new TaskEvent to this task's TaskState. The actual
  1123  // TaskState.State (pending, running, dead) is not changed. Use UpdateState to
  1124  // transition states.
  1125  // Events are persisted locally and sent to the server, but errors are simply
  1126  // logged. Use AppendEvent to add a new event without notifying the alloc runner.
  1127  func (tr *TaskRunner) EmitEvent(event *structs.TaskEvent) {
  1128  	tr.stateLock.Lock()
  1129  	defer tr.stateLock.Unlock()
  1130  
  1131  	tr.appendEvent(event)
  1132  
  1133  	if err := tr.stateDB.PutTaskState(tr.allocID, tr.taskName, tr.state); err != nil {
  1134  		// Only a warning because the next event/state-transition will
  1135  		// try to persist it again.
  1136  		tr.logger.Warn("error persisting event", "error", err, "event", event)
  1137  	}
  1138  
  1139  	// Notify the alloc runner of the event
  1140  	tr.stateUpdater.TaskStateUpdated()
  1141  }
  1142  
  1143  // AppendEvent appends a new TaskEvent to this task's TaskState. The actual
  1144  // TaskState.State (pending, running, dead) is not changed. Use UpdateState to
  1145  // transition states.
  1146  // Events are persisted locally and errors are simply logged. Use EmitEvent
  1147  // to also update the AllocRunner.
  1148  func (tr *TaskRunner) AppendEvent(event *structs.TaskEvent) {
  1149  	tr.stateLock.Lock()
  1150  	defer tr.stateLock.Unlock()
  1151  
  1152  	tr.appendEvent(event)
  1153  
  1154  	if err := tr.stateDB.PutTaskState(tr.allocID, tr.taskName, tr.state); err != nil {
  1155  		// Only a warning because the next event/state-transition will
  1156  		// try to persist it again.
  1157  		tr.logger.Warn("error persisting event", "error", err, "event", event)
  1158  	}
  1159  }
  1160  
  1161  // appendEvent to task's event slice. Caller must acquire stateLock.
  1162  func (tr *TaskRunner) appendEvent(event *structs.TaskEvent) error {
  1163  	// Ensure the event is populated with human readable strings
  1164  	event.PopulateEventDisplayMessage()
  1165  
  1166  	// Propagate failure from event to task state
  1167  	if event.FailsTask {
  1168  		tr.state.Failed = true
  1169  	}
  1170  
  1171  	// XXX This seems like a super awkward spot for this? Why not shouldRestart?
  1172  	// Update restart metrics
  1173  	if event.Type == structs.TaskRestarting {
  1174  		if !tr.clientConfig.DisableTaggedMetrics {
  1175  			metrics.IncrCounterWithLabels([]string{"client", "allocs", "restart"}, 1, tr.baseLabels)
  1176  		}
  1177  		//if r.config.BackwardsCompatibleMetrics {
  1178  		//metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "restart"}, 1)
  1179  		//}
  1180  		tr.state.Restarts++
  1181  		tr.state.LastRestart = time.Unix(0, event.Time)
  1182  	}
  1183  
  1184  	// Append event to slice
  1185  	appendTaskEvent(tr.state, event, tr.maxEvents)
  1186  
  1187  	return nil
  1188  }
  1189  
  1190  // WaitCh is closed when TaskRunner.Run exits.
  1191  func (tr *TaskRunner) WaitCh() <-chan struct{} {
  1192  	return tr.waitCh
  1193  }
  1194  
  1195  // Update the running allocation with a new version received from the server.
  1196  // Calls Update hooks asynchronously with Run.
  1197  //
  1198  // This method is safe for calling concurrently with Run and does not modify
  1199  // the passed in allocation.
  1200  func (tr *TaskRunner) Update(update *structs.Allocation) {
  1201  	task := update.LookupTask(tr.taskName)
  1202  	if task == nil {
  1203  		// This should not happen and likely indicates a bug in the
  1204  		// server or client.
  1205  		tr.logger.Error("allocation update is missing task; killing",
  1206  			"group", update.TaskGroup)
  1207  		te := structs.NewTaskEvent(structs.TaskKilled).
  1208  			SetKillReason("update missing task").
  1209  			SetFailsTask()
  1210  		tr.Kill(context.Background(), te)
  1211  		return
  1212  	}
  1213  
  1214  	// Update tr.alloc
  1215  	tr.setAlloc(update, task)
  1216  
  1217  	// Trigger update hooks if not terminal
  1218  	if !update.TerminalStatus() {
  1219  		tr.triggerUpdateHooks()
  1220  	}
  1221  }
  1222  
  1223  // SetNetworkIsolation is called by the PreRun allocation hook after configuring
  1224  // the network isolation for the allocation
  1225  func (tr *TaskRunner) SetNetworkIsolation(n *drivers.NetworkIsolationSpec) {
  1226  	tr.networkIsolationLock.Lock()
  1227  	tr.networkIsolationSpec = n
  1228  	tr.networkIsolationLock.Unlock()
  1229  }
  1230  
  1231  // triggerUpdateHooks triggers an update if one isn't already pending. Should be called
  1232  // instead of calling updateHooks directly to serialize runs of update hooks.
  1233  // TaskRunner state should be updated prior to triggering update hooks.
  1234  //
  1235  // Does not block.
  1236  func (tr *TaskRunner) triggerUpdateHooks() {
  1237  	select {
  1238  	case tr.triggerUpdateCh <- struct{}{}:
  1239  	default:
  1240  		// already an update hook pending
  1241  	}
  1242  }
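
        // Because triggerUpdateCh has capacity triggerUpdateChCap (1), rapid triggers
        // coalesce into a single pending signal and handleUpdates runs the hooks once
        // more with the latest alloc, for example:
        //
        //	tr.triggerUpdateHooks()
        //	tr.triggerUpdateHooks() // no-op: an update is already pending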
  1243  
  1244  // Shutdown TaskRunner gracefully without affecting the state of the task.
  1245  // Shutdown blocks until the main Run loop exits.
  1246  func (tr *TaskRunner) Shutdown() {
  1247  	tr.logger.Trace("shutting down")
  1248  	tr.shutdownCtxCancel()
  1249  
  1250  	<-tr.WaitCh()
  1251  
  1252  	// Run shutdown hooks to cleanup
  1253  	tr.shutdownHooks()
  1254  
  1255  	// Persist once more
  1256  	tr.persistLocalState()
  1257  }
  1258  
  1259  // LatestResourceUsage returns the last resource utilization datapoint
  1260  // collected. May return nil if the task is not running or no resource
  1261  // utilization has been collected yet.
  1262  func (tr *TaskRunner) LatestResourceUsage() *cstructs.TaskResourceUsage {
  1263  	tr.resourceUsageLock.Lock()
  1264  	ru := tr.resourceUsage
  1265  	tr.resourceUsageLock.Unlock()
  1266  
  1267  	// Look up device statistics lazily when fetched, as we do not currently emit any stats for them
  1268  	if ru != nil && tr.deviceStatsReporter != nil {
  1269  		deviceResources := tr.taskResources.Devices
  1270  		ru.ResourceUsage.DeviceStats = tr.deviceStatsReporter.LatestDeviceResourceStats(deviceResources)
  1271  	}
  1272  	return ru
  1273  }
  1274  
  1275  // UpdateStats updates and emits the latest stats from the driver.
  1276  func (tr *TaskRunner) UpdateStats(ru *cstructs.TaskResourceUsage) {
  1277  	tr.resourceUsageLock.Lock()
  1278  	tr.resourceUsage = ru
  1279  	tr.resourceUsageLock.Unlock()
  1280  	if ru != nil {
  1281  		tr.emitStats(ru)
  1282  	}
  1283  }
  1284  
  1285  //TODO Remove Backwardscompat or use tr.Alloc()?
  1286  func (tr *TaskRunner) setGaugeForMemory(ru *cstructs.TaskResourceUsage) {
  1287  	alloc := tr.Alloc()
  1288  	var allocatedMem float32
  1289  	if alloc.AllocatedResources != nil {
  1290  		if taskRes := alloc.AllocatedResources.Tasks[tr.taskName]; taskRes != nil {
  1291  			// Convert to bytes to match other memory metrics
  1292  			allocatedMem = float32(taskRes.Memory.MemoryMB) * 1024 * 1024
  1293  		}
  1294  	} else if taskRes := alloc.TaskResources[tr.taskName]; taskRes != nil {
  1295  		// COMPAT(0.11) Remove in 0.11 when TaskResources is removed
  1296  		allocatedMem = float32(taskRes.MemoryMB) * 1024 * 1024
  1297  
  1298  	}
  1299  
  1300  	if !tr.clientConfig.DisableTaggedMetrics {
  1301  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "rss"},
  1302  			float32(ru.ResourceUsage.MemoryStats.RSS), tr.baseLabels)
  1303  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "cache"},
  1304  			float32(ru.ResourceUsage.MemoryStats.Cache), tr.baseLabels)
  1305  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "swap"},
  1306  			float32(ru.ResourceUsage.MemoryStats.Swap), tr.baseLabels)
  1307  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "usage"},
  1308  			float32(ru.ResourceUsage.MemoryStats.Usage), tr.baseLabels)
  1309  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "max_usage"},
  1310  			float32(ru.ResourceUsage.MemoryStats.MaxUsage), tr.baseLabels)
  1311  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "kernel_usage"},
  1312  			float32(ru.ResourceUsage.MemoryStats.KernelUsage), tr.baseLabels)
  1313  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "kernel_max_usage"},
  1314  			float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage), tr.baseLabels)
  1315  		if allocatedMem > 0 {
  1316  			metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "allocated"},
  1317  				allocatedMem, tr.baseLabels)
  1318  		}
  1319  	}
  1320  
  1321  	if tr.clientConfig.BackwardsCompatibleMetrics {
  1322  		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "rss"}, float32(ru.ResourceUsage.MemoryStats.RSS))
  1323  		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "cache"}, float32(ru.ResourceUsage.MemoryStats.Cache))
  1324  		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "swap"}, float32(ru.ResourceUsage.MemoryStats.Swap))
  1325  		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "usage"}, float32(ru.ResourceUsage.MemoryStats.Usage))
  1326  		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "max_usage"}, float32(ru.ResourceUsage.MemoryStats.MaxUsage))
  1327  		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "kernel_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelUsage))
  1328  		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "kernel_max_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage))
  1329  		if allocatedMem > 0 {
  1330  			metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "allocated"}, allocatedMem)
  1331  		}
  1332  	}
  1333  }
  1334  
  1335  //TODO Remove Backwardscompat or use tr.Alloc()?
  1336  func (tr *TaskRunner) setGaugeForCPU(ru *cstructs.TaskResourceUsage) {
  1337  	alloc := tr.Alloc()
  1338  	var allocatedCPU float32
  1339  	if alloc.AllocatedResources != nil {
  1340  		if taskRes := alloc.AllocatedResources.Tasks[tr.taskName]; taskRes != nil {
  1341  			allocatedCPU = float32(taskRes.Cpu.CpuShares)
  1342  		}
  1343  	} else if taskRes := alloc.TaskResources[tr.taskName]; taskRes != nil {
  1344  		// COMPAT(0.11) Remove in 0.11 when TaskResources is removed
  1345  		allocatedCPU = float32(taskRes.CPU)
  1346  	}
  1347  
  1348  	if !tr.clientConfig.DisableTaggedMetrics {
  1349  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "total_percent"},
  1350  			float32(ru.ResourceUsage.CpuStats.Percent), tr.baseLabels)
  1351  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "system"},
  1352  			float32(ru.ResourceUsage.CpuStats.SystemMode), tr.baseLabels)
  1353  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "user"},
  1354  			float32(ru.ResourceUsage.CpuStats.UserMode), tr.baseLabels)
  1355  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "throttled_time"},
  1356  			float32(ru.ResourceUsage.CpuStats.ThrottledTime), tr.baseLabels)
  1357  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "throttled_periods"},
  1358  			float32(ru.ResourceUsage.CpuStats.ThrottledPeriods), tr.baseLabels)
  1359  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "total_ticks"},
  1360  			float32(ru.ResourceUsage.CpuStats.TotalTicks), tr.baseLabels)
  1361  		if allocatedCPU > 0 {
  1362  			metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "allocated"},
  1363  				allocatedCPU, tr.baseLabels)
  1364  		}
  1365  	}
  1366  
  1367  	if tr.clientConfig.BackwardsCompatibleMetrics {
  1368  		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "total_percent"}, float32(ru.ResourceUsage.CpuStats.Percent))
  1369  		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "system"}, float32(ru.ResourceUsage.CpuStats.SystemMode))
  1370  		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "user"}, float32(ru.ResourceUsage.CpuStats.UserMode))
  1371  		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "throttled_time"}, float32(ru.ResourceUsage.CpuStats.ThrottledTime))
  1372  		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "throttled_periods"}, float32(ru.ResourceUsage.CpuStats.ThrottledPeriods))
  1373  		metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "total_ticks"}, float32(ru.ResourceUsage.CpuStats.TotalTicks))
  1374  		if allocatedCPU > 0 {
  1375  			metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "cpu", "allocated"}, allocatedCPU)
  1376  		}
  1377  	}
  1378  }
  1379  
  1380  // emitStats emits resource usage stats of tasks to remote metrics collector
  1381  // sinks
  1382  func (tr *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) {
  1383  	if !tr.clientConfig.PublishAllocationMetrics {
  1384  		return
  1385  	}
  1386  
  1387  	if ru.ResourceUsage.MemoryStats != nil {
  1388  		tr.setGaugeForMemory(ru)
  1389  	} else {
  1390  		tr.logger.Debug("Skipping memory stats for allocation", "reason", "MemoryStats is nil")
  1391  	}
  1392  
  1393  	if ru.ResourceUsage.CpuStats != nil {
  1394  		tr.setGaugeForCPU(ru)
  1395  	} else {
  1396  		tr.logger.Debug("Skipping cpu stats for allocation", "reason", "CpuStats is nil")
  1397  	}
  1398  }
  1399  
  1400  // appendTaskEvent updates the task status by appending the new event.
  1401  func appendTaskEvent(state *structs.TaskState, event *structs.TaskEvent, capacity int) {
  1402  	if state.Events == nil {
  1403  		state.Events = make([]*structs.TaskEvent, 1, capacity)
  1404  		state.Events[0] = event
  1405  		return
  1406  	}
  1407  
  1408  	// If we hit capacity, then shift it.
  1409  	if len(state.Events) == capacity {
  1410  		old := state.Events
  1411  		state.Events = make([]*structs.TaskEvent, 0, capacity)
  1412  		state.Events = append(state.Events, old[1:]...)
  1413  	}
  1414  
  1415  	state.Events = append(state.Events, event)
  1416  }
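
        // For example, with the default capacity of defaultMaxEvents (10), appending an
        // 11th event rebuilds the slice from old[1:] and then appends, so
        // TaskState.Events always keeps only the most recent events up to capacity.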
  1417  
  1418  func (tr *TaskRunner) TaskExecHandler() drivermanager.TaskExecHandler {
  1419  	// Check it is running
  1420  	handle := tr.getDriverHandle()
  1421  	if handle == nil {
  1422  		return nil
  1423  	}
  1424  	return handle.ExecStreaming
  1425  }
  1426  
  1427  func (tr *TaskRunner) DriverCapabilities() (*drivers.Capabilities, error) {
  1428  	return tr.driver.Capabilities()
  1429  }