github.com/hspak/nomad@v0.7.2-0.20180309000617-bc4ae22a39a5/client/task_runner.go

github.com/hspak/nomad@v0.7.2-0.20180309000617-bc4ae22a39a5/client/task_runner.go (about)

     1  package client
     2  
     3  import (
     4  	"bytes"
     5  	"crypto/md5"
     6  	"encoding/hex"
     7  	"fmt"
     8  	"io"
     9  	"io/ioutil"
    10  	"log"
    11  	"os"
    12  	"path/filepath"
    13  	"strings"
    14  	"sync"
    15  	"time"
    16  
    17  	metrics "github.com/armon/go-metrics"
    18  	"github.com/boltdb/bolt"
    19  	"github.com/golang/snappy"
    20  	"github.com/hashicorp/consul-template/signals"
    21  	"github.com/hashicorp/go-multierror"
    22  	version "github.com/hashicorp/go-version"
    23  	"github.com/hashicorp/nomad/client/allocdir"
    24  	"github.com/hashicorp/nomad/client/config"
    25  	"github.com/hashicorp/nomad/client/driver"
    26  	"github.com/hashicorp/nomad/client/getter"
    27  	"github.com/hashicorp/nomad/client/vaultclient"
    28  	"github.com/hashicorp/nomad/nomad/structs"
    29  	"github.com/ugorji/go/codec"
    30  
    31  	"github.com/hashicorp/nomad/client/driver/env"
    32  	dstructs "github.com/hashicorp/nomad/client/driver/structs"
    33  	cstructs "github.com/hashicorp/nomad/client/structs"
    34  )
    35  
    36  const (
    37  	// killBackoffBaseline is the baseline time for exponential backoff while
    38  	// killing a task.
    39  	killBackoffBaseline = 5 * time.Second
    40  
    41  	// killBackoffLimit is the limit of the exponential backoff for killing
    42  	// the task.
    43  	killBackoffLimit = 2 * time.Minute
    44  
    45  	// killFailureLimit is how many times we will attempt to kill a task before
    46  	// giving up and potentially leaking resources.
    47  	killFailureLimit = 5
    48  
    49  	// vaultBackoffBaseline is the baseline time for exponential backoff when
    50  	// attempting to retrieve a Vault token
    51  	vaultBackoffBaseline = 5 * time.Second
    52  
    53  	// vaultBackoffLimit is the limit of the exponential backoff when attempting
    54  	// to retrieve a Vault token
    55  	vaultBackoffLimit = 3 * time.Minute
    56  
    57  	// vaultTokenFile is the name of the file holding the Vault token inside the
    58  	// task's secret directory
    59  	vaultTokenFile = "vault_token"
    60  )
    61  
    62  var (
    63  	// taskRunnerStateAllKey holds all the task runners state. At the moment
    64  	// there is no need to split it
    65  	taskRunnerStateAllKey = []byte("simple-all")
    66  )
    67  
    68  // taskRestartEvent wraps a TaskEvent with additional metadata to control
    69  // restart behavior.
    70  type taskRestartEvent struct {
    71  	// taskEvent to report
    72  	taskEvent *structs.TaskEvent
    73  
    74  	// if false, don't count against restart count
    75  	failure bool
    76  }
    77  
    78  func newTaskRestartEvent(reason string, failure bool) *taskRestartEvent {
    79  	return &taskRestartEvent{
    80  		taskEvent: structs.NewTaskEvent(structs.TaskRestartSignal).SetRestartReason(reason),
    81  		failure:   failure,
    82  	}
    83  }
    84  
    85  // TaskRunner is used to wrap a task within an allocation and provide the execution context.
    86  type TaskRunner struct {
    87  	stateDB        *bolt.DB
    88  	config         *config.Config
    89  	updater        TaskStateUpdater
    90  	logger         *log.Logger
    91  	restartTracker *RestartTracker
    92  	consul         ConsulServiceAPI
    93  
    94  	// running marks whether the task is running
    95  	running     bool
    96  	runningLock sync.Mutex
    97  
    98  	resourceUsage     *cstructs.TaskResourceUsage
    99  	resourceUsageLock sync.RWMutex
   100  
   101  	alloc   *structs.Allocation
   102  	task    *structs.Task
   103  	taskDir *allocdir.TaskDir
   104  
   105  	// envBuilder is used to build the task's environment
   106  	envBuilder *env.Builder
   107  
   108  	// driverNet is the network information returned by the driver
   109  	driverNet     *cstructs.DriverNetwork
   110  	driverNetLock sync.Mutex
   111  
   112  	// updateCh is used to receive updated versions of the allocation
   113  	updateCh chan *structs.Allocation
   114  
   115  	handle     driver.DriverHandle
   116  	handleLock sync.Mutex
   117  
   118  	// artifactsDownloaded tracks whether the tasks artifacts have been
   119  	// downloaded
   120  	//
   121  	// Must acquire persistLock when accessing
   122  	artifactsDownloaded bool
   123  
   124  	// taskDirBuilt tracks whether the task has built its directory.
   125  	//
   126  	// Must acquire persistLock when accessing
   127  	taskDirBuilt bool
   128  
   129  	// createdResources are all the resources created by the task driver
   130  	// across all attempts to start the task.
   131  	// Simple gets and sets should use {get,set}CreatedResources
   132  	createdResources     *driver.CreatedResources
   133  	createdResourcesLock sync.Mutex
   134  
   135  	// payloadRendered tracks whether the payload has been rendered to disk
   136  	payloadRendered bool
   137  
   138  	// vaultFuture is the means to wait for and get a Vault token
   139  	vaultFuture *tokenFuture
   140  
   141  	// recoveredVaultToken is the token that was recovered through a restore
   142  	recoveredVaultToken string
   143  
   144  	// vaultClient is used to retrieve and renew any needed Vault token
   145  	vaultClient vaultclient.VaultClient
   146  
   147  	// templateManager is used to manage any consul-templates this task may have
   148  	templateManager *TaskTemplateManager
   149  
   150  	// startCh is used to trigger the start of the task
   151  	startCh chan struct{}
   152  
   153  	// unblockCh is used to unblock the starting of the task
   154  	unblockCh   chan struct{}
   155  	unblocked   bool
   156  	unblockLock sync.Mutex
   157  
   158  	// restartCh is used to restart a task
   159  	restartCh chan *taskRestartEvent
   160  
   161  	// signalCh is used to send a signal to a task
   162  	signalCh chan SignalEvent
   163  
   164  	destroy      bool
   165  	destroyCh    chan struct{}
   166  	destroyLock  sync.Mutex
   167  	destroyEvent *structs.TaskEvent
   168  
   169  	// waitCh closing marks the run loop as having exited
   170  	waitCh chan struct{}
   171  
   172  	// persistLock must be acquired when accessing fields stored by
   173  	// SaveState. SaveState is called asynchronously to TaskRunner.Run by
   174  	// AllocRunner, so all state fields must be synchronized using this
   175  	// lock.
   176  	persistLock sync.Mutex
   177  
   178  	// persistedHash is the hash of the last persisted snapshot. It is used to
   179  	// detect if a new snapshot has to be written to disk.
   180  	persistedHash []byte
   181  
   182  	// baseLabels are used when emitting tagged metrics. All task runner metrics
   183  	// will have these tags, and optionally more.
   184  	baseLabels []metrics.Label
   185  }
   186  
   187  // taskRunnerState is used to snapshot the state of the task runner
   188  type taskRunnerState struct {
   189  	Version            string
   190  	HandleID           string
   191  	ArtifactDownloaded bool
   192  	TaskDirBuilt       bool
   193  	PayloadRendered    bool
   194  	CreatedResources   *driver.CreatedResources
   195  	DriverNetwork      *cstructs.DriverNetwork
   196  }
   197  
   198  func (s *taskRunnerState) Hash() []byte {
   199  	h := md5.New()
   200  
   201  	io.WriteString(h, s.Version)
   202  	io.WriteString(h, s.HandleID)
   203  	io.WriteString(h, fmt.Sprintf("%v", s.ArtifactDownloaded))
   204  	io.WriteString(h, fmt.Sprintf("%v", s.TaskDirBuilt))
   205  	io.WriteString(h, fmt.Sprintf("%v", s.PayloadRendered))
   206  	h.Write(s.CreatedResources.Hash())
   207  	h.Write(s.DriverNetwork.Hash())
   208  
   209  	return h.Sum(nil)
   210  }
   211  
   212  // TaskStateUpdater is used to signal that tasks state has changed. If lazySync
   213  // is set the event won't be immediately pushed to the server.
   214  type TaskStateUpdater func(taskName, state string, event *structs.TaskEvent, lazySync bool)
   215  
   216  // SignalEvent is a tuple of the signal and the event generating it
   217  type SignalEvent struct {
   218  	// s is the signal to be sent
   219  	s os.Signal
   220  
   221  	// e is the task event generating the signal
   222  	e *structs.TaskEvent
   223  
   224  	// result should be used to send back the result of the signal
   225  	result chan<- error
   226  }
   227  
   228  // NewTaskRunner is used to create a new task context
   229  func NewTaskRunner(logger *log.Logger, config *config.Config,
   230  	stateDB *bolt.DB, updater TaskStateUpdater, taskDir *allocdir.TaskDir,
   231  	alloc *structs.Allocation, task *structs.Task,
   232  	vaultClient vaultclient.VaultClient, consulClient ConsulServiceAPI) *TaskRunner {
   233  
   234  	// Merge in the task resources
   235  	task.Resources = alloc.TaskResources[task.Name]
   236  
   237  	// Build the restart tracker.
   238  	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
   239  	if tg == nil {
   240  		logger.Printf("[ERR] client: alloc %q for missing task group %q", alloc.ID, alloc.TaskGroup)
   241  		return nil
   242  	}
   243  	restartTracker := newRestartTracker(tg.RestartPolicy, alloc.Job.Type)
   244  
   245  	// Initialize the environment builder
   246  	envBuilder := env.NewBuilder(config.Node, alloc, task, config.Region)
   247  
   248  	tc := &TaskRunner{
   249  		config:           config,
   250  		stateDB:          stateDB,
   251  		updater:          updater,
   252  		logger:           logger,
   253  		restartTracker:   restartTracker,
   254  		alloc:            alloc,
   255  		task:             task,
   256  		taskDir:          taskDir,
   257  		envBuilder:       envBuilder,
   258  		createdResources: driver.NewCreatedResources(),
   259  		consul:           consulClient,
   260  		vaultClient:      vaultClient,
   261  		vaultFuture:      NewTokenFuture().Set(""),
   262  		updateCh:         make(chan *structs.Allocation, 64),
   263  		destroyCh:        make(chan struct{}),
   264  		waitCh:           make(chan struct{}),
   265  		startCh:          make(chan struct{}, 1),
   266  		unblockCh:        make(chan struct{}),
   267  		restartCh:        make(chan *taskRestartEvent),
   268  		signalCh:         make(chan SignalEvent),
   269  	}
   270  
   271  	tc.baseLabels = []metrics.Label{
   272  		{
   273  			Name:  "job",
   274  			Value: tc.alloc.Job.Name,
   275  		},
   276  		{
   277  			Name:  "task_group",
   278  			Value: tc.alloc.TaskGroup,
   279  		},
   280  		{
   281  			Name:  "alloc_id",
   282  			Value: tc.alloc.ID,
   283  		},
   284  		{
   285  			Name:  "task",
   286  			Value: tc.task.Name,
   287  		},
   288  	}
   289  
   290  	return tc
   291  }
   292  
   293  // MarkReceived marks the task as received.
   294  func (r *TaskRunner) MarkReceived() {
   295  	// We lazy sync this since there will be a follow up message almost
   296  	// immediately.
   297  	r.updater(r.task.Name, structs.TaskStatePending, structs.NewTaskEvent(structs.TaskReceived), true)
   298  }
   299  
   300  // WaitCh returns a channel to wait for termination
   301  func (r *TaskRunner) WaitCh() <-chan struct{} {
   302  	return r.waitCh
   303  }
   304  
   305  // getHandle returns the task's handle or nil
   306  func (r *TaskRunner) getHandle() driver.DriverHandle {
   307  	r.handleLock.Lock()
   308  	h := r.handle
   309  	r.handleLock.Unlock()
   310  	return h
   311  }
   312  
   313  // pre060StateFilePath returns the path to our state file that would have been
   314  // written pre v0.6.0
   315  // COMPAT: Remove in 0.7.0
   316  func (r *TaskRunner) pre060StateFilePath() string {
   317  	// Get the MD5 of the task name
   318  	hashVal := md5.Sum([]byte(r.task.Name))
   319  	hashHex := hex.EncodeToString(hashVal[:])
   320  	dirName := fmt.Sprintf("task-%s", hashHex)
   321  
   322  	// Generate the path
   323  	return filepath.Join(r.config.StateDir, "alloc", r.alloc.ID, dirName, "state.json")
   324  }
   325  
   326  // RestoreState is used to restore our state. If a non-empty string is returned
   327  // the task is restarted with the string as the reason. This is useful for
   328  // backwards incompatible upgrades that need to restart tasks with a new
   329  // executor.
   330  func (r *TaskRunner) RestoreState() (string, error) {
   331  	// COMPAT: Remove in 0.7.0
   332  	// 0.6.0 transistioned from individual state files to a single bolt-db.
   333  	// The upgrade path is to:
   334  	// Check if old state exists
   335  	//   If so, restore from that and delete old state
   336  	// Restore using state database
   337  
   338  	var snap taskRunnerState
   339  
   340  	// Check if the old snapshot is there
   341  	oldPath := r.pre060StateFilePath()
   342  	if err := pre060RestoreState(oldPath, &snap); err == nil {
   343  		// Delete the old state
   344  		os.RemoveAll(oldPath)
   345  	} else if !os.IsNotExist(err) {
   346  		// Something corrupt in the old state file
   347  		return "", err
   348  	} else {
   349  		// We are doing a normal restore
   350  		err := r.stateDB.View(func(tx *bolt.Tx) error {
   351  			bkt, err := getTaskBucket(tx, r.alloc.ID, r.task.Name)
   352  			if err != nil {
   353  				return fmt.Errorf("failed to get task bucket: %v", err)
   354  			}
   355  
   356  			if err := getObject(bkt, taskRunnerStateAllKey, &snap); err != nil {
   357  				return fmt.Errorf("failed to read task runner state: %v", err)
   358  			}
   359  			return nil
   360  		})
   361  		if err != nil {
   362  			return "", err
   363  		}
   364  
   365  	}
   366  
   367  	// Restore fields from the snapshot
   368  	r.artifactsDownloaded = snap.ArtifactDownloaded
   369  	r.taskDirBuilt = snap.TaskDirBuilt
   370  	r.payloadRendered = snap.PayloadRendered
   371  	r.setCreatedResources(snap.CreatedResources)
   372  	r.driverNet = snap.DriverNetwork
   373  
   374  	if r.task.Vault != nil {
   375  		// Read the token from the secret directory
   376  		tokenPath := filepath.Join(r.taskDir.SecretsDir, vaultTokenFile)
   377  		data, err := ioutil.ReadFile(tokenPath)
   378  		if err != nil {
   379  			if !os.IsNotExist(err) {
   380  				return "", fmt.Errorf("failed to read token for task %q in alloc %q: %v", r.task.Name, r.alloc.ID, err)
   381  			}
   382  
   383  			// Token file doesn't exist
   384  		} else {
   385  			// Store the recovered token
   386  			r.recoveredVaultToken = string(data)
   387  		}
   388  	}
   389  
   390  	// Restore the driver
   391  	restartReason := ""
   392  	if snap.HandleID != "" {
   393  		d, err := r.createDriver()
   394  		if err != nil {
   395  			return "", err
   396  		}
   397  
   398  		// Add the restored network driver to the environment
   399  		r.envBuilder.SetDriverNetwork(r.driverNet)
   400  
   401  		// Open a connection to the driver handle
   402  		ctx := driver.NewExecContext(r.taskDir, r.envBuilder.Build())
   403  		handle, err := d.Open(ctx, snap.HandleID)
   404  
   405  		// In the case it fails, we relaunch the task in the Run() method.
   406  		if err != nil {
   407  			r.logger.Printf("[ERR] client: failed to open handle to task %q for alloc %q: %v",
   408  				r.task.Name, r.alloc.ID, err)
   409  			return "", nil
   410  		}
   411  
   412  		if pre06ScriptCheck(snap.Version, r.task.Driver, r.task.Services) {
   413  			restartReason = pre06ScriptCheckReason
   414  		}
   415  
   416  		if err := r.registerServices(d, handle, r.driverNet); err != nil {
   417  			// Don't hard fail here as there's a chance this task
   418  			// registered with Consul properly when it initial
   419  			// started.
   420  			r.logger.Printf("[WARN] client: failed to register services and checks with consul for task %q in alloc %q: %v",
   421  				r.task.Name, r.alloc.ID, err)
   422  		}
   423  
   424  		r.handleLock.Lock()
   425  		r.handle = handle
   426  		r.handleLock.Unlock()
   427  
   428  		r.runningLock.Lock()
   429  		r.running = true
   430  		r.runningLock.Unlock()
   431  	}
   432  	return restartReason, nil
   433  }
   434  
   435  // ver06 is used for checking for pre-0.6 script checks
   436  var ver06 = version.Must(version.NewVersion("0.6.0dev"))
   437  
   438  // pre06ScriptCheckReason is the restart reason given when a pre-0.6 script
   439  // check is found on an exec/java task.
   440  const pre06ScriptCheckReason = "upgrading pre-0.6 script checks"
   441  
   442  // pre06ScriptCheck returns true if version is prior to 0.6.0dev, has a script
   443  // check, and uses exec or java drivers.
   444  func pre06ScriptCheck(ver, driver string, services []*structs.Service) bool {
   445  	if driver != "exec" && driver != "java" && driver != "mock_driver" {
   446  		// Only exec and java are affected
   447  		return false
   448  	}
   449  	v, err := version.NewVersion(ver)
   450  	if err != nil {
   451  		// Treat it as old
   452  		return true
   453  	}
   454  	if !v.LessThan(ver06) {
   455  		// >= 0.6.0dev
   456  		return false
   457  	}
   458  	for _, service := range services {
   459  		for _, check := range service.Checks {
   460  			if check.Type == "script" {
   461  				return true
   462  			}
   463  		}
   464  	}
   465  	return false
   466  }
   467  
   468  // SaveState is used to snapshot our state
   469  func (r *TaskRunner) SaveState() error {
   470  	r.destroyLock.Lock()
   471  	defer r.destroyLock.Unlock()
   472  	if r.destroy {
   473  		// Don't save state if already destroyed
   474  		return nil
   475  	}
   476  
   477  	r.persistLock.Lock()
   478  	defer r.persistLock.Unlock()
   479  	snap := taskRunnerState{
   480  		Version:            r.config.Version.VersionNumber(),
   481  		ArtifactDownloaded: r.artifactsDownloaded,
   482  		TaskDirBuilt:       r.taskDirBuilt,
   483  		PayloadRendered:    r.payloadRendered,
   484  		CreatedResources:   r.getCreatedResources(),
   485  	}
   486  
   487  	r.handleLock.Lock()
   488  	if r.handle != nil {
   489  		snap.HandleID = r.handle.ID()
   490  	}
   491  	r.handleLock.Unlock()
   492  
   493  	r.driverNetLock.Lock()
   494  	snap.DriverNetwork = r.driverNet.Copy()
   495  	r.driverNetLock.Unlock()
   496  
   497  	// If nothing has changed avoid the write
   498  	h := snap.Hash()
   499  	if bytes.Equal(h, r.persistedHash) {
   500  		return nil
   501  	}
   502  
   503  	// Serialize the object
   504  	var buf bytes.Buffer
   505  	if err := codec.NewEncoder(&buf, structs.MsgpackHandle).Encode(&snap); err != nil {
   506  		return fmt.Errorf("failed to serialize snapshot: %v", err)
   507  	}
   508  
   509  	// Start the transaction.
   510  	return r.stateDB.Batch(func(tx *bolt.Tx) error {
   511  		// Grab the task bucket
   512  		taskBkt, err := getTaskBucket(tx, r.alloc.ID, r.task.Name)
   513  		if err != nil {
   514  			return fmt.Errorf("failed to retrieve allocation bucket: %v", err)
   515  		}
   516  
   517  		if err := putData(taskBkt, taskRunnerStateAllKey, buf.Bytes()); err != nil {
   518  			return fmt.Errorf("failed to write task_runner state: %v", err)
   519  		}
   520  
   521  		// Store the hash that was persisted
   522  		tx.OnCommit(func() {
   523  			r.persistedHash = h
   524  		})
   525  
   526  		return nil
   527  	})
   528  }
   529  
   530  // DestroyState is used to cleanup after ourselves
   531  func (r *TaskRunner) DestroyState() error {
   532  	r.persistLock.Lock()
   533  	defer r.persistLock.Unlock()
   534  
   535  	return r.stateDB.Update(func(tx *bolt.Tx) error {
   536  		if err := deleteTaskBucket(tx, r.alloc.ID, r.task.Name); err != nil {
   537  			return fmt.Errorf("failed to delete task bucket: %v", err)
   538  		}
   539  		return nil
   540  	})
   541  }
   542  
   543  // setState is used to update the state of the task runner
   544  func (r *TaskRunner) setState(state string, event *structs.TaskEvent, lazySync bool) {
   545  	event.PopulateEventDisplayMessage()
   546  
   547  	// Persist our state to disk.
   548  	if err := r.SaveState(); err != nil {
   549  		r.logger.Printf("[ERR] client: failed to save state of Task Runner for task %q: %v", r.task.Name, err)
   550  	}
   551  
   552  	// Indicate the task has been updated.
   553  	r.updater(r.task.Name, state, event, lazySync)
   554  }
   555  
   556  // createDriver makes a driver for the task
   557  func (r *TaskRunner) createDriver() (driver.Driver, error) {
   558  	// Create a task-specific event emitter callback to expose minimal
   559  	// state to drivers
   560  	eventEmitter := func(m string, args ...interface{}) {
   561  		msg := fmt.Sprintf(m, args...)
   562  		r.logger.Printf("[DEBUG] client: driver event for alloc %q: %s", r.alloc.ID, msg)
   563  		r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskDriverMessage).SetDriverMessage(msg), false)
   564  	}
   565  
   566  	driverCtx := driver.NewDriverContext(r.task.Name, r.alloc.ID, r.config, r.config.Node, r.logger, eventEmitter)
   567  	d, err := driver.NewDriver(r.task.Driver, driverCtx)
   568  	if err != nil {
   569  		return nil, fmt.Errorf("failed to create driver '%s' for alloc %s: %v",
   570  			r.task.Driver, r.alloc.ID, err)
   571  	}
   572  
   573  	return d, err
   574  }
   575  
   576  // Run is a long running routine used to manage the task
   577  func (r *TaskRunner) Run() {
   578  	defer close(r.waitCh)
   579  	r.logger.Printf("[DEBUG] client: starting task context for '%s' (alloc '%s')",
   580  		r.task.Name, r.alloc.ID)
   581  
   582  	if err := r.validateTask(); err != nil {
   583  		r.setState(
   584  			structs.TaskStateDead,
   585  			structs.NewTaskEvent(structs.TaskFailedValidation).SetValidationError(err).SetFailsTask(),
   586  			false)
   587  		return
   588  	}
   589  
   590  	// Create a temporary driver so that we can determine the FSIsolation
   591  	// required. run->startTask will create a new driver after environment
   592  	// has been setup (env vars, templates, artifacts, secrets, etc).
   593  	tmpDrv, err := r.createDriver()
   594  	if err != nil {
   595  		e := fmt.Errorf("failed to create driver of task %q for alloc %q: %v", r.task.Name, r.alloc.ID, err)
   596  		r.setState(
   597  			structs.TaskStateDead,
   598  			structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(e).SetFailsTask(),
   599  			false)
   600  		return
   601  	}
   602  
   603  	// Build base task directory structure regardless of FS isolation abilities.
   604  	// This needs to happen before we start the Vault manager and call prestart
   605  	// as both those can write to the task directories
   606  	if err := r.buildTaskDir(tmpDrv.FSIsolation()); err != nil {
   607  		e := fmt.Errorf("failed to build task directory for %q: %v", r.task.Name, err)
   608  		r.setState(
   609  			structs.TaskStateDead,
   610  			structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(e).SetFailsTask(),
   611  			false)
   612  		return
   613  	}
   614  
   615  	// If there is no Vault policy leave the static future created in
   616  	// NewTaskRunner
   617  	if r.task.Vault != nil {
   618  		// Start the go-routine to get a Vault token
   619  		r.vaultFuture.Clear()
   620  		go r.vaultManager(r.recoveredVaultToken)
   621  	}
   622  
   623  	// Start the run loop
   624  	r.run()
   625  
   626  	// Do any cleanup necessary
   627  	r.postrun()
   628  
   629  	return
   630  }
   631  
   632  // validateTask validates the fields of the task and returns an error if the
   633  // task is invalid.
   634  func (r *TaskRunner) validateTask() error {
   635  	var mErr multierror.Error
   636  
   637  	// Validate the user.
   638  	unallowedUsers := r.config.ReadStringListToMapDefault("user.blacklist", config.DefaultUserBlacklist)
   639  	checkDrivers := r.config.ReadStringListToMapDefault("user.checked_drivers", config.DefaultUserCheckedDrivers)
   640  	if _, driverMatch := checkDrivers[r.task.Driver]; driverMatch {
   641  		if _, unallowed := unallowedUsers[r.task.User]; unallowed {
   642  			mErr.Errors = append(mErr.Errors, fmt.Errorf("running as user %q is disallowed", r.task.User))
   643  		}
   644  	}
   645  
   646  	// Validate the artifacts
   647  	for i, artifact := range r.task.Artifacts {
   648  		// Verify the artifact doesn't escape the task directory.
   649  		if err := artifact.Validate(); err != nil {
   650  			// If this error occurs there is potentially a server bug or
   651  			// mallicious, server spoofing.
   652  			r.logger.Printf("[ERR] client: allocation %q, task %v, artifact %#v (%v) fails validation: %v",
   653  				r.alloc.ID, r.task.Name, artifact, i, err)
   654  			mErr.Errors = append(mErr.Errors, fmt.Errorf("artifact (%d) failed validation: %v", i, err))
   655  		}
   656  	}
   657  
   658  	// Validate the Service names
   659  	taskEnv := r.envBuilder.Build()
   660  	for i, service := range r.task.Services {
   661  		name := taskEnv.ReplaceEnv(service.Name)
   662  		if err := service.ValidateName(name); err != nil {
   663  			mErr.Errors = append(mErr.Errors, fmt.Errorf("service (%d) failed validation: %v", i, err))
   664  		}
   665  	}
   666  
   667  	if len(mErr.Errors) == 1 {
   668  		return mErr.Errors[0]
   669  	}
   670  	return mErr.ErrorOrNil()
   671  }
   672  
   673  // tokenFuture stores the Vault token and allows consumers to block till a valid
   674  // token exists
   675  type tokenFuture struct {
   676  	waiting []chan struct{}
   677  	token   string
   678  	set     bool
   679  	m       sync.Mutex
   680  }
   681  
   682  // NewTokenFuture returns a new token future without any token set
   683  func NewTokenFuture() *tokenFuture {
   684  	return &tokenFuture{}
   685  }
   686  
   687  // Wait returns a channel that can be waited on. When this channel unblocks, a
   688  // valid token will be available via the Get method
   689  func (f *tokenFuture) Wait() <-chan struct{} {
   690  	f.m.Lock()
   691  	defer f.m.Unlock()
   692  
   693  	c := make(chan struct{})
   694  	if f.set {
   695  		close(c)
   696  		return c
   697  	}
   698  
   699  	f.waiting = append(f.waiting, c)
   700  	return c
   701  }
   702  
   703  // Set sets the token value and unblocks any caller of Wait
   704  func (f *tokenFuture) Set(token string) *tokenFuture {
   705  	f.m.Lock()
   706  	defer f.m.Unlock()
   707  
   708  	f.set = true
   709  	f.token = token
   710  	for _, w := range f.waiting {
   711  		close(w)
   712  	}
   713  	f.waiting = nil
   714  	return f
   715  }
   716  
   717  // Clear clears the set vault token.
   718  func (f *tokenFuture) Clear() *tokenFuture {
   719  	f.m.Lock()
   720  	defer f.m.Unlock()
   721  
   722  	f.token = ""
   723  	f.set = false
   724  	return f
   725  }
   726  
   727  // Get returns the set Vault token
   728  func (f *tokenFuture) Get() string {
   729  	f.m.Lock()
   730  	defer f.m.Unlock()
   731  	return f.token
   732  }
   733  
   734  // vaultManager should be called in a go-routine and manages the derivation,
   735  // renewal and handling of errors with the Vault token. The optional parameter
   736  // allows setting the initial Vault token. This is useful when the Vault token
   737  // is recovered off disk.
   738  func (r *TaskRunner) vaultManager(token string) {
   739  	// Helper for stopping token renewal
   740  	stopRenewal := func() {
   741  		if err := r.vaultClient.StopRenewToken(r.vaultFuture.Get()); err != nil {
   742  			r.logger.Printf("[WARN] client: failed to stop token renewal for task %v in alloc %q: %v", r.task.Name, r.alloc.ID, err)
   743  		}
   744  	}
   745  
   746  	// updatedToken lets us store state between loops. If true, a new token
   747  	// has been retrieved and we need to apply the Vault change mode
   748  	var updatedToken bool
   749  
   750  OUTER:
   751  	for {
   752  		// Check if we should exit
   753  		select {
   754  		case <-r.waitCh:
   755  			stopRenewal()
   756  			return
   757  		default:
   758  		}
   759  
   760  		// Clear the token
   761  		r.vaultFuture.Clear()
   762  
   763  		// Check if there already is a token which can be the case for
   764  		// restoring the TaskRunner
   765  		if token == "" {
   766  			// Get a token
   767  			var exit bool
   768  			token, exit = r.deriveVaultToken()
   769  			if exit {
   770  				// Exit the manager
   771  				return
   772  			}
   773  
   774  			// Write the token to disk
   775  			if err := r.writeToken(token); err != nil {
   776  				e := fmt.Errorf("failed to write Vault token to disk")
   777  				r.logger.Printf("[ERR] client: %v for task %v on alloc %q: %v", e, r.task.Name, r.alloc.ID, err)
   778  				r.Kill("vault", e.Error(), true)
   779  				return
   780  			}
   781  		}
   782  
   783  		// Start the renewal process
   784  		renewCh, err := r.vaultClient.RenewToken(token, 30)
   785  
   786  		// An error returned means the token is not being renewed
   787  		if err != nil {
   788  			r.logger.Printf("[ERR] client: failed to start renewal of Vault token for task %v on alloc %q: %v", r.task.Name, r.alloc.ID, err)
   789  			token = ""
   790  			goto OUTER
   791  		}
   792  
   793  		// The Vault token is valid now, so set it
   794  		r.vaultFuture.Set(token)
   795  
   796  		if updatedToken {
   797  			switch r.task.Vault.ChangeMode {
   798  			case structs.VaultChangeModeSignal:
   799  				s, err := signals.Parse(r.task.Vault.ChangeSignal)
   800  				if err != nil {
   801  					e := fmt.Errorf("failed to parse signal: %v", err)
   802  					r.logger.Printf("[ERR] client: %v", err)
   803  					r.Kill("vault", e.Error(), true)
   804  					return
   805  				}
   806  
   807  				if err := r.Signal("vault", "new Vault token acquired", s); err != nil {
   808  					r.logger.Printf("[ERR] client: failed to send signal to task %v for alloc %q: %v", r.task.Name, r.alloc.ID, err)
   809  					r.Kill("vault", fmt.Sprintf("failed to send signal to task: %v", err), true)
   810  					return
   811  				}
   812  			case structs.VaultChangeModeRestart:
   813  				const noFailure = false
   814  				r.Restart("vault", "new Vault token acquired", noFailure)
   815  			case structs.VaultChangeModeNoop:
   816  				fallthrough
   817  			default:
   818  				r.logger.Printf("[ERR] client: Invalid Vault change mode: %q", r.task.Vault.ChangeMode)
   819  			}
   820  
   821  			// We have handled it
   822  			updatedToken = false
   823  
   824  			// Call the handler
   825  			r.updatedTokenHandler()
   826  		}
   827  
   828  		// Start watching for renewal errors
   829  		select {
   830  		case err := <-renewCh:
   831  			// Clear the token
   832  			token = ""
   833  			r.logger.Printf("[ERR] client: failed to renew Vault token for task %v on alloc %q: %v", r.task.Name, r.alloc.ID, err)
   834  			stopRenewal()
   835  
   836  			// Check if we have to do anything
   837  			if r.task.Vault.ChangeMode != structs.VaultChangeModeNoop {
   838  				updatedToken = true
   839  			}
   840  		case <-r.waitCh:
   841  			stopRenewal()
   842  			return
   843  		}
   844  	}
   845  }
   846  
   847  // deriveVaultToken derives the Vault token using exponential backoffs. It
   848  // returns the Vault token and whether the manager should exit.
   849  func (r *TaskRunner) deriveVaultToken() (token string, exit bool) {
   850  	attempts := 0
   851  	for {
   852  		tokens, err := r.vaultClient.DeriveToken(r.alloc, []string{r.task.Name})
   853  		if err == nil {
   854  			return tokens[r.task.Name], false
   855  		}
   856  
   857  		// Check if we can't recover from the error
   858  		if !structs.IsRecoverable(err) {
   859  			r.logger.Printf("[ERR] client: failed to derive Vault token for task %v on alloc %q: %v",
   860  				r.task.Name, r.alloc.ID, err)
   861  			r.Kill("vault", fmt.Sprintf("failed to derive token: %v", err), true)
   862  			return "", true
   863  		}
   864  
   865  		// Handle the retry case
   866  		backoff := (1 << (2 * uint64(attempts))) * vaultBackoffBaseline
   867  		if backoff > vaultBackoffLimit {
   868  			backoff = vaultBackoffLimit
   869  		}
   870  		r.logger.Printf("[ERR] client: failed to derive Vault token for task %v on alloc %q: %v; retrying in %v",
   871  			r.task.Name, r.alloc.ID, err, backoff)
   872  
   873  		attempts++
   874  
   875  		// Wait till retrying
   876  		select {
   877  		case <-r.waitCh:
   878  			return "", true
   879  		case <-time.After(backoff):
   880  		}
   881  	}
   882  }
   883  
   884  // writeToken writes the given token to disk
   885  func (r *TaskRunner) writeToken(token string) error {
   886  	tokenPath := filepath.Join(r.taskDir.SecretsDir, vaultTokenFile)
   887  	if err := ioutil.WriteFile(tokenPath, []byte(token), 0777); err != nil {
   888  		return fmt.Errorf("failed to save Vault tokens to secret dir for task %q in alloc %q: %v", r.task.Name, r.alloc.ID, err)
   889  	}
   890  
   891  	return nil
   892  }
   893  
   894  // updatedTokenHandler is called when a new Vault token is retrieved. Things
   895  // that rely on the token should be updated here.
   896  func (r *TaskRunner) updatedTokenHandler() {
   897  
   898  	// Update the tasks environment
   899  	r.envBuilder.SetVaultToken(r.vaultFuture.Get(), r.task.Vault.Env)
   900  
   901  	if r.templateManager != nil {
   902  		r.templateManager.Stop()
   903  
   904  		// Create a new templateManager
   905  		var err error
   906  		r.templateManager, err = NewTaskTemplateManager(&TaskTemplateManagerConfig{
   907  			Hooks:                r,
   908  			Templates:            r.task.Templates,
   909  			ClientConfig:         r.config,
   910  			VaultToken:           r.vaultFuture.Get(),
   911  			TaskDir:              r.taskDir.Dir,
   912  			EnvBuilder:           r.envBuilder,
   913  			MaxTemplateEventRate: DefaultMaxTemplateEventRate,
   914  		})
   915  
   916  		if err != nil {
   917  			err := fmt.Errorf("failed to build task's template manager: %v", err)
   918  			r.setState(structs.TaskStateDead,
   919  				structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(),
   920  				false)
   921  			r.logger.Printf("[ERR] client: alloc %q, task %q %v", r.alloc.ID, r.task.Name, err)
   922  			r.Kill("vault", err.Error(), true)
   923  			return
   924  		}
   925  	}
   926  }
   927  
   928  // prestart handles life-cycle tasks that occur before the task has started.
   929  // Since it's run asynchronously with the main Run() loop the alloc & task are
   930  // passed in to avoid racing with updates.
   931  func (r *TaskRunner) prestart(alloc *structs.Allocation, task *structs.Task, resultCh chan bool) {
   932  	if task.Vault != nil {
   933  		// Wait for the token
   934  		r.logger.Printf("[DEBUG] client: waiting for Vault token for task %v in alloc %q", task.Name, alloc.ID)
   935  		tokenCh := r.vaultFuture.Wait()
   936  		select {
   937  		case <-tokenCh:
   938  		case <-r.waitCh:
   939  			resultCh <- false
   940  			return
   941  		}
   942  		r.logger.Printf("[DEBUG] client: retrieved Vault token for task %v in alloc %q", task.Name, alloc.ID)
   943  		r.envBuilder.SetVaultToken(r.vaultFuture.Get(), task.Vault.Env)
   944  	}
   945  
   946  	// If the job is a dispatch job and there is a payload write it to disk
   947  	requirePayload := len(alloc.Job.Payload) != 0 &&
   948  		(r.task.DispatchPayload != nil && r.task.DispatchPayload.File != "")
   949  	if !r.payloadRendered && requirePayload {
   950  		renderTo := filepath.Join(r.taskDir.LocalDir, task.DispatchPayload.File)
   951  		decoded, err := snappy.Decode(nil, alloc.Job.Payload)
   952  		if err != nil {
   953  			r.setState(
   954  				structs.TaskStateDead,
   955  				structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(),
   956  				false)
   957  			resultCh <- false
   958  			return
   959  		}
   960  
   961  		if err := os.MkdirAll(filepath.Dir(renderTo), 07777); err != nil {
   962  			r.setState(
   963  				structs.TaskStateDead,
   964  				structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(),
   965  				false)
   966  			resultCh <- false
   967  			return
   968  		}
   969  
   970  		if err := ioutil.WriteFile(renderTo, decoded, 0777); err != nil {
   971  			r.setState(
   972  				structs.TaskStateDead,
   973  				structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(),
   974  				false)
   975  			resultCh <- false
   976  			return
   977  		}
   978  
   979  		r.payloadRendered = true
   980  	}
   981  
   982  	for {
   983  		r.persistLock.Lock()
   984  		downloaded := r.artifactsDownloaded
   985  		r.persistLock.Unlock()
   986  
   987  		// Download the task's artifacts
   988  		if !downloaded && len(task.Artifacts) > 0 {
   989  			r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskDownloadingArtifacts), false)
   990  			taskEnv := r.envBuilder.Build()
   991  			for _, artifact := range task.Artifacts {
   992  				if err := getter.GetArtifact(taskEnv, artifact, r.taskDir.Dir); err != nil {
   993  					wrapped := fmt.Errorf("failed to download artifact %q: %v", artifact.GetterSource, err)
   994  					r.logger.Printf("[DEBUG] client: %v", wrapped)
   995  					r.setState(structs.TaskStatePending,
   996  						structs.NewTaskEvent(structs.TaskArtifactDownloadFailed).SetDownloadError(wrapped), false)
   997  					r.restartTracker.SetStartError(structs.WrapRecoverable(wrapped.Error(), err))
   998  					goto RESTART
   999  				}
  1000  			}
  1001  
  1002  			r.persistLock.Lock()
  1003  			r.artifactsDownloaded = true
  1004  			r.persistLock.Unlock()
  1005  		}
  1006  
  1007  		// We don't have to wait for any template
  1008  		if len(task.Templates) == 0 {
  1009  			// Send the start signal
  1010  			select {
  1011  			case r.startCh <- struct{}{}:
  1012  			default:
  1013  			}
  1014  
  1015  			resultCh <- true
  1016  			return
  1017  		}
  1018  
  1019  		// Build the template manager
  1020  		if r.templateManager == nil {
  1021  			var err error
  1022  			r.templateManager, err = NewTaskTemplateManager(&TaskTemplateManagerConfig{
  1023  				Hooks:                r,
  1024  				Templates:            r.task.Templates,
  1025  				ClientConfig:         r.config,
  1026  				VaultToken:           r.vaultFuture.Get(),
  1027  				TaskDir:              r.taskDir.Dir,
  1028  				EnvBuilder:           r.envBuilder,
  1029  				MaxTemplateEventRate: DefaultMaxTemplateEventRate,
  1030  			})
  1031  			if err != nil {
  1032  				err := fmt.Errorf("failed to build task's template manager: %v", err)
  1033  				r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(), false)
  1034  				r.logger.Printf("[ERR] client: alloc %q, task %q %v", alloc.ID, task.Name, err)
  1035  				resultCh <- false
  1036  				return
  1037  			}
  1038  		}
  1039  
  1040  		// Block for consul-template
  1041  		// TODO Hooks should register themselves as blocking and then we can
  1042  		// perioidcally enumerate what we are still blocked on
  1043  		select {
  1044  		case <-r.unblockCh:
  1045  			// Send the start signal
  1046  			select {
  1047  			case r.startCh <- struct{}{}:
  1048  			default:
  1049  			}
  1050  
  1051  			resultCh <- true
  1052  			return
  1053  		case <-r.waitCh:
  1054  			// The run loop has exited so exit too
  1055  			resultCh <- false
  1056  			return
  1057  		}
  1058  
  1059  	RESTART:
  1060  		restart := r.shouldRestart()
  1061  		if !restart {
  1062  			resultCh <- false
  1063  			return
  1064  		}
  1065  	}
  1066  }
  1067  
  1068  // postrun is used to do any cleanup that is necessary after exiting the runloop
  1069  func (r *TaskRunner) postrun() {
  1070  	// Stop the template manager
  1071  	if r.templateManager != nil {
  1072  		r.templateManager.Stop()
  1073  	}
  1074  }
  1075  
  1076  // run is the main run loop that handles starting the application, destroying
  1077  // it, restarts and signals.
  1078  func (r *TaskRunner) run() {
  1079  	// Predeclare things so we can jump to the RESTART
  1080  	var stopCollection chan struct{}
  1081  	var handleWaitCh chan *dstructs.WaitResult
  1082  
  1083  	// If we already have a handle, populate the stopCollection and handleWaitCh
  1084  	// to fix the invariant that it exists.
  1085  	handleEmpty := r.getHandle() == nil
  1086  
  1087  	if !handleEmpty {
  1088  		stopCollection = make(chan struct{})
  1089  		go r.collectResourceUsageStats(stopCollection)
  1090  		handleWaitCh = r.handle.WaitCh()
  1091  	}
  1092  
  1093  	for {
  1094  		// Do the prestart activities
  1095  		prestartResultCh := make(chan bool, 1)
  1096  		go r.prestart(r.alloc, r.task, prestartResultCh)
  1097  
  1098  	WAIT:
  1099  		for {
  1100  			select {
  1101  			case success := <-prestartResultCh:
  1102  				if !success {
  1103  					r.cleanup()
  1104  					r.setState(structs.TaskStateDead, nil, false)
  1105  					return
  1106  				}
  1107  			case <-r.startCh:
  1108  				// Start the task if not yet started or it is being forced. This logic
  1109  				// is necessary because in the case of a restore the handle already
  1110  				// exists.
  1111  				handleEmpty := r.getHandle() == nil
  1112  				if handleEmpty {
  1113  					startErr := r.startTask()
  1114  					r.restartTracker.SetStartError(startErr)
  1115  					if startErr != nil {
  1116  						r.setState("", structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(startErr), true)
  1117  						goto RESTART
  1118  					}
  1119  
  1120  					// Mark the task as started
  1121  					r.setState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted), false)
  1122  					r.runningLock.Lock()
  1123  					r.running = true
  1124  					r.runningLock.Unlock()
  1125  
  1126  					if stopCollection == nil {
  1127  						stopCollection = make(chan struct{})
  1128  						go r.collectResourceUsageStats(stopCollection)
  1129  					}
  1130  
  1131  					handleWaitCh = r.handle.WaitCh()
  1132  				}
  1133  
  1134  			case waitRes := <-handleWaitCh:
  1135  				if waitRes == nil {
  1136  					panic("nil wait")
  1137  				}
  1138  
  1139  				r.runningLock.Lock()
  1140  				r.running = false
  1141  				r.runningLock.Unlock()
  1142  
  1143  				// Stop collection of the task's resource usage
  1144  				close(stopCollection)
  1145  
  1146  				// Log whether the task was successful or not.
  1147  				r.restartTracker.SetWaitResult(waitRes)
  1148  				r.setState("", r.waitErrorToEvent(waitRes), true)
  1149  				if !waitRes.Successful() {
  1150  					r.logger.Printf("[INFO] client: task %q for alloc %q failed: %v", r.task.Name, r.alloc.ID, waitRes)
  1151  				} else {
  1152  					r.logger.Printf("[INFO] client: task %q for alloc %q completed successfully", r.task.Name, r.alloc.ID)
  1153  				}
  1154  
  1155  				break WAIT
  1156  			case update := <-r.updateCh:
  1157  				if err := r.handleUpdate(update); err != nil {
  1158  					r.logger.Printf("[ERR] client: update to task %q failed: %v", r.task.Name, err)
  1159  				}
  1160  
  1161  			case se := <-r.signalCh:
  1162  				r.runningLock.Lock()
  1163  				running := r.running
  1164  				r.runningLock.Unlock()
  1165  				common := fmt.Sprintf("signal %v to task %v for alloc %q", se.s, r.task.Name, r.alloc.ID)
  1166  				if !running {
  1167  					// Send no error
  1168  					r.logger.Printf("[DEBUG] client: skipping %s", common)
  1169  					se.result <- nil
  1170  					continue
  1171  				}
  1172  
  1173  				r.logger.Printf("[DEBUG] client: sending %s", common)
  1174  				r.setState(structs.TaskStateRunning, se.e, false)
  1175  
  1176  				res := r.handle.Signal(se.s)
  1177  				se.result <- res
  1178  
  1179  			case restartEvent := <-r.restartCh:
  1180  				r.runningLock.Lock()
  1181  				running := r.running
  1182  				r.runningLock.Unlock()
  1183  				common := fmt.Sprintf("task %v for alloc %q", r.task.Name, r.alloc.ID)
  1184  				if !running {
  1185  					r.logger.Printf("[DEBUG] client: skipping restart of %v: task isn't running", common)
  1186  					continue
  1187  				}
  1188  
  1189  				r.logger.Printf("[DEBUG] client: restarting %s: %v", common, restartEvent.taskEvent.RestartReason)
  1190  				r.setState(structs.TaskStateRunning, restartEvent.taskEvent, false)
  1191  				r.killTask(nil)
  1192  
  1193  				close(stopCollection)
  1194  
  1195  				if handleWaitCh != nil {
  1196  					<-handleWaitCh
  1197  				}
  1198  
  1199  				r.restartTracker.SetRestartTriggered(restartEvent.failure)
  1200  				break WAIT
  1201  
  1202  			case <-r.destroyCh:
  1203  				r.runningLock.Lock()
  1204  				running := r.running
  1205  				r.runningLock.Unlock()
  1206  				if !running {
  1207  					r.cleanup()
  1208  					r.setState(structs.TaskStateDead, r.destroyEvent, false)
  1209  					return
  1210  				}
  1211  
  1212  				// Remove from consul before killing the task so that traffic
  1213  				// can be rerouted
  1214  				interpTask := interpolateServices(r.envBuilder.Build(), r.task)
  1215  				r.consul.RemoveTask(r.alloc.ID, interpTask)
  1216  
  1217  				// Delay actually killing the task if configured. See #244
  1218  				if r.task.ShutdownDelay > 0 {
  1219  					r.logger.Printf("[DEBUG] client: delaying shutdown of alloc %q task %q for %q",
  1220  						r.alloc.ID, r.task.Name, r.task.ShutdownDelay)
  1221  					<-time.After(r.task.ShutdownDelay)
  1222  				}
  1223  
  1224  				// Store the task event that provides context on the task
  1225  				// destroy. The Killed event is set from the alloc_runner and
  1226  				// doesn't add detail
  1227  				var killEvent *structs.TaskEvent
  1228  				if r.destroyEvent.Type != structs.TaskKilled {
  1229  					if r.destroyEvent.Type == structs.TaskKilling {
  1230  						killEvent = r.destroyEvent
  1231  					} else {
  1232  						r.setState(structs.TaskStateRunning, r.destroyEvent, false)
  1233  					}
  1234  				}
  1235  
  1236  				r.killTask(killEvent)
  1237  				close(stopCollection)
  1238  
  1239  				// Wait for handler to exit before calling cleanup
  1240  				<-handleWaitCh
  1241  				r.cleanup()
  1242  
  1243  				r.setState(structs.TaskStateDead, nil, false)
  1244  				return
  1245  			}
  1246  		}
  1247  
  1248  	RESTART:
  1249  		// shouldRestart will block if the task should restart after a delay.
  1250  		restart := r.shouldRestart()
  1251  		if !restart {
  1252  			r.cleanup()
  1253  			r.setState(structs.TaskStateDead, nil, false)
  1254  			return
  1255  		}
  1256  
  1257  		// Clear the handle so a new driver will be created.
  1258  		r.handleLock.Lock()
  1259  		r.handle = nil
  1260  		handleWaitCh = nil
  1261  		stopCollection = nil
  1262  		r.handleLock.Unlock()
  1263  	}
  1264  }
  1265  
  1266  // cleanup removes Consul entries and calls Driver.Cleanup when a task is
  1267  // stopping. Errors are logged.
  1268  func (r *TaskRunner) cleanup() {
  1269  	// Remove from Consul
  1270  	interpTask := interpolateServices(r.envBuilder.Build(), r.task)
  1271  	r.consul.RemoveTask(r.alloc.ID, interpTask)
  1272  
  1273  	drv, err := r.createDriver()
  1274  	if err != nil {
  1275  		r.logger.Printf("[ERR] client: error creating driver to cleanup resources: %v", err)
  1276  		return
  1277  	}
  1278  
  1279  	res := r.getCreatedResources()
  1280  
  1281  	ctx := driver.NewExecContext(r.taskDir, r.envBuilder.Build())
  1282  	attempts := 1
  1283  	var cleanupErr error
  1284  	for retry := true; retry; attempts++ {
  1285  		cleanupErr = drv.Cleanup(ctx, res)
  1286  		retry = structs.IsRecoverable(cleanupErr)
  1287  
  1288  		// Copy current createdResources state in case SaveState is
  1289  		// called between retries
  1290  		r.setCreatedResources(res)
  1291  
  1292  		// Retry 3 times with sleeps between
  1293  		if !retry || attempts > 3 {
  1294  			break
  1295  		}
  1296  		time.Sleep(time.Duration(attempts) * time.Second)
  1297  	}
  1298  
  1299  	if cleanupErr != nil {
  1300  		r.logger.Printf("[ERR] client: error cleaning up resources for task %q after %d attempts: %v", r.task.Name, attempts, cleanupErr)
  1301  	}
  1302  	return
  1303  }
  1304  
  1305  // shouldRestart returns if the task should restart. If the return value is
  1306  // true, the task's restart policy has already been considered and any wait time
  1307  // between restarts has been applied.
  1308  func (r *TaskRunner) shouldRestart() bool {
  1309  	state, when := r.restartTracker.GetState()
  1310  	reason := r.restartTracker.GetReason()
  1311  	switch state {
  1312  	case structs.TaskNotRestarting, structs.TaskTerminated:
  1313  		r.logger.Printf("[INFO] client: Not restarting task: %v for alloc: %v ", r.task.Name, r.alloc.ID)
  1314  		if state == structs.TaskNotRestarting {
  1315  			r.setState(structs.TaskStateDead,
  1316  				structs.NewTaskEvent(structs.TaskNotRestarting).
  1317  					SetRestartReason(reason).SetFailsTask(),
  1318  				false)
  1319  		}
  1320  		return false
  1321  	case structs.TaskRestarting:
  1322  		r.logger.Printf("[INFO] client: Restarting task %q for alloc %q in %v", r.task.Name, r.alloc.ID, when)
  1323  		r.setState(structs.TaskStatePending,
  1324  			structs.NewTaskEvent(structs.TaskRestarting).
  1325  				SetRestartDelay(when).
  1326  				SetRestartReason(reason),
  1327  			false)
  1328  	default:
  1329  		r.logger.Printf("[ERR] client: restart tracker returned unknown state: %q", state)
  1330  		return false
  1331  	}
  1332  
  1333  	// Unregister from Consul while waiting to restart.
  1334  	interpTask := interpolateServices(r.envBuilder.Build(), r.task)
  1335  	r.consul.RemoveTask(r.alloc.ID, interpTask)
  1336  
  1337  	// Sleep but watch for destroy events.
  1338  	select {
  1339  	case <-time.After(when):
  1340  	case <-r.destroyCh:
  1341  	}
  1342  
  1343  	// Destroyed while we were waiting to restart, so abort.
  1344  	r.destroyLock.Lock()
  1345  	destroyed := r.destroy
  1346  	r.destroyLock.Unlock()
  1347  	if destroyed {
  1348  		r.logger.Printf("[DEBUG] client: Not restarting task: %v because it has been destroyed", r.task.Name)
  1349  		r.setState(structs.TaskStateDead, r.destroyEvent, false)
  1350  		return false
  1351  	}
  1352  
  1353  	return true
  1354  }
  1355  
  1356  // killTask kills the running task. A killing event can optionally be passed and
  1357  // this event is used to mark the task as being killed. It provides a means to
  1358  // store extra information.
  1359  func (r *TaskRunner) killTask(killingEvent *structs.TaskEvent) {
  1360  	r.runningLock.Lock()
  1361  	running := r.running
  1362  	r.runningLock.Unlock()
  1363  	if !running {
  1364  		return
  1365  	}
  1366  
  1367  	// Get the kill timeout
  1368  	timeout := driver.GetKillTimeout(r.task.KillTimeout, r.config.MaxKillTimeout)
  1369  
  1370  	// Build the event
  1371  	var event *structs.TaskEvent
  1372  	if killingEvent != nil {
  1373  		event = killingEvent
  1374  		event.Type = structs.TaskKilling
  1375  	} else {
  1376  		event = structs.NewTaskEvent(structs.TaskKilling)
  1377  	}
  1378  	event.SetKillTimeout(timeout)
  1379  
  1380  	// Mark that we received the kill event
  1381  	r.setState(structs.TaskStateRunning, event, false)
  1382  
  1383  	handle := r.getHandle()
  1384  
  1385  	// Kill the task using an exponential backoff in-case of failures.
  1386  	destroySuccess, err := r.handleDestroy(handle)
  1387  	if !destroySuccess {
  1388  		// We couldn't successfully destroy the resource created.
  1389  		r.logger.Printf("[ERR] client: failed to kill task %q. Resources may have been leaked: %v", r.task.Name, err)
  1390  	}
  1391  
  1392  	r.runningLock.Lock()
  1393  	r.running = false
  1394  	r.runningLock.Unlock()
  1395  
  1396  	// Store that the task has been destroyed and any associated error.
  1397  	r.setState("", structs.NewTaskEvent(structs.TaskKilled).SetKillError(err), true)
  1398  }
  1399  
  1400  // startTask creates the driver, task dir, and starts the task.
  1401  func (r *TaskRunner) startTask() error {
  1402  	// Create a driver
  1403  	drv, err := r.createDriver()
  1404  	if err != nil {
  1405  		return fmt.Errorf("failed to create driver of task %q for alloc %q: %v",
  1406  			r.task.Name, r.alloc.ID, err)
  1407  	}
  1408  
  1409  	// Run prestart
  1410  	ctx := driver.NewExecContext(r.taskDir, r.envBuilder.Build())
  1411  	presp, err := drv.Prestart(ctx, r.task)
  1412  
  1413  	// Merge newly created resources into previously created resources
  1414  	if presp != nil {
  1415  		r.createdResourcesLock.Lock()
  1416  		r.createdResources.Merge(presp.CreatedResources)
  1417  		r.createdResourcesLock.Unlock()
  1418  
  1419  		// Set any network configuration returned by the driver
  1420  		r.envBuilder.SetDriverNetwork(presp.Network)
  1421  	}
  1422  
  1423  	if err != nil {
  1424  		wrapped := fmt.Sprintf("failed to initialize task %q for alloc %q: %v",
  1425  			r.task.Name, r.alloc.ID, err)
  1426  		r.logger.Printf("[WARN] client: error from prestart: %s", wrapped)
  1427  		return structs.WrapRecoverable(wrapped, err)
  1428  	}
  1429  
  1430  	// Create a new context for Start since the environment may have been updated.
  1431  	ctx = driver.NewExecContext(r.taskDir, r.envBuilder.Build())
  1432  
  1433  	// Start the job
  1434  	sresp, err := drv.Start(ctx, r.task)
  1435  	if err != nil {
  1436  		wrapped := fmt.Sprintf("failed to start task %q for alloc %q: %v",
  1437  			r.task.Name, r.alloc.ID, err)
  1438  		r.logger.Printf("[WARN] client: %s", wrapped)
  1439  		return structs.WrapRecoverable(wrapped, err)
  1440  
  1441  	}
  1442  
  1443  	// Log driver network information
  1444  	if sresp.Network != nil && sresp.Network.IP != "" {
  1445  		if sresp.Network.AutoAdvertise {
  1446  			r.logger.Printf("[INFO] client: alloc %s task %s auto-advertising detected IP %s",
  1447  				r.alloc.ID, r.task.Name, sresp.Network.IP)
  1448  		} else {
  1449  			r.logger.Printf("[TRACE] client: alloc %s task %s detected IP %s but not auto-advertising",
  1450  				r.alloc.ID, r.task.Name, sresp.Network.IP)
  1451  		}
  1452  	}
  1453  
  1454  	if sresp.Network == nil || sresp.Network.IP == "" {
  1455  		r.logger.Printf("[TRACE] client: alloc %s task %s could not detect a driver IP", r.alloc.ID, r.task.Name)
  1456  	}
  1457  
  1458  	// Update environment with the network defined by the driver's Start method.
  1459  	r.envBuilder.SetDriverNetwork(sresp.Network)
  1460  
  1461  	if err := r.registerServices(drv, sresp.Handle, sresp.Network); err != nil {
  1462  		// All IO is done asynchronously, so errors from registering
  1463  		// services are hard failures.
  1464  		r.logger.Printf("[ERR] client: failed to register services and checks for task %q alloc %q: %v", r.task.Name, r.alloc.ID, err)
  1465  
  1466  		// Kill the started task
  1467  		if destroyed, err := r.handleDestroy(sresp.Handle); !destroyed {
  1468  			r.logger.Printf("[ERR] client: failed to kill task %q alloc %q. Resources may be leaked: %v",
  1469  				r.task.Name, r.alloc.ID, err)
  1470  		}
  1471  		return structs.NewRecoverableError(err, false)
  1472  	}
  1473  
  1474  	r.handleLock.Lock()
  1475  	r.handle = sresp.Handle
  1476  	r.handleLock.Unlock()
  1477  
  1478  	// Need to persist the driver network between restarts
  1479  	r.driverNetLock.Lock()
  1480  	r.driverNet = sresp.Network
  1481  	r.driverNetLock.Unlock()
  1482  
  1483  	return nil
  1484  }
  1485  
  1486  // registerServices and checks with Consul.
  1487  func (r *TaskRunner) registerServices(d driver.Driver, h driver.DriverHandle, n *cstructs.DriverNetwork) error {
  1488  	var exec driver.ScriptExecutor
  1489  	if d.Abilities().Exec {
  1490  		// Allow set the script executor if the driver supports it
  1491  		exec = h
  1492  	}
  1493  	interpolatedTask := interpolateServices(r.envBuilder.Build(), r.task)
  1494  	return r.consul.RegisterTask(r.alloc.ID, interpolatedTask, r, exec, n)
  1495  }
  1496  
  1497  // interpolateServices interpolates tags in a service and checks with values from the
  1498  // task's environment.
  1499  func interpolateServices(taskEnv *env.TaskEnv, task *structs.Task) *structs.Task {
  1500  	taskCopy := task.Copy()
  1501  	for _, service := range taskCopy.Services {
  1502  		for _, check := range service.Checks {
  1503  			check.Name = taskEnv.ReplaceEnv(check.Name)
  1504  			check.Type = taskEnv.ReplaceEnv(check.Type)
  1505  			check.Command = taskEnv.ReplaceEnv(check.Command)
  1506  			check.Args = taskEnv.ParseAndReplace(check.Args)
  1507  			check.Path = taskEnv.ReplaceEnv(check.Path)
  1508  			check.Protocol = taskEnv.ReplaceEnv(check.Protocol)
  1509  			check.PortLabel = taskEnv.ReplaceEnv(check.PortLabel)
  1510  			check.InitialStatus = taskEnv.ReplaceEnv(check.InitialStatus)
  1511  			check.Method = taskEnv.ReplaceEnv(check.Method)
  1512  			if len(check.Header) > 0 {
  1513  				header := make(map[string][]string, len(check.Header))
  1514  				for k, vs := range check.Header {
  1515  					newVals := make([]string, len(vs))
  1516  					for i, v := range vs {
  1517  						newVals[i] = taskEnv.ReplaceEnv(v)
  1518  					}
  1519  					header[taskEnv.ReplaceEnv(k)] = newVals
  1520  				}
  1521  				check.Header = header
  1522  			}
  1523  		}
  1524  		service.Name = taskEnv.ReplaceEnv(service.Name)
  1525  		service.PortLabel = taskEnv.ReplaceEnv(service.PortLabel)
  1526  		service.Tags = taskEnv.ParseAndReplace(service.Tags)
  1527  	}
  1528  	return taskCopy
  1529  }
  1530  
  1531  // buildTaskDir creates the task directory before driver.Prestart. It is safe
  1532  // to call multiple times as its state is persisted.
  1533  func (r *TaskRunner) buildTaskDir(fsi cstructs.FSIsolation) error {
  1534  	r.persistLock.Lock()
  1535  	built := r.taskDirBuilt
  1536  	r.persistLock.Unlock()
  1537  
  1538  	// We do not set the state again since this only occurs during restoration
  1539  	// and the task dir is already built. The reason we call Build again is to
  1540  	// ensure that the task dir invariants are still held.
  1541  	if !built {
  1542  		r.setState(structs.TaskStatePending,
  1543  			structs.NewTaskEvent(structs.TaskSetup).SetMessage(structs.TaskBuildingTaskDir),
  1544  			false)
  1545  	}
  1546  
  1547  	chroot := config.DefaultChrootEnv
  1548  	if len(r.config.ChrootEnv) > 0 {
  1549  		chroot = r.config.ChrootEnv
  1550  	}
  1551  	if err := r.taskDir.Build(built, chroot, fsi); err != nil {
  1552  		return err
  1553  	}
  1554  
  1555  	// Mark task dir as successfully built
  1556  	r.persistLock.Lock()
  1557  	r.taskDirBuilt = true
  1558  	r.persistLock.Unlock()
  1559  
  1560  	// Set path and host related env vars
  1561  	driver.SetEnvvars(r.envBuilder, fsi, r.taskDir, r.config)
  1562  	return nil
  1563  }
  1564  
  1565  // collectResourceUsageStats starts collecting resource usage stats of a Task.
  1566  // Collection ends when the passed channel is closed
  1567  func (r *TaskRunner) collectResourceUsageStats(stopCollection <-chan struct{}) {
  1568  	// start collecting the stats right away and then start collecting every
  1569  	// collection interval
  1570  	next := time.NewTimer(0)
  1571  	defer next.Stop()
  1572  	for {
  1573  		select {
  1574  		case <-next.C:
  1575  			next.Reset(r.config.StatsCollectionInterval)
  1576  			handle := r.getHandle()
  1577  			if handle == nil {
  1578  				continue
  1579  			}
  1580  			ru, err := handle.Stats()
  1581  
  1582  			if err != nil {
  1583  				// Check if the driver doesn't implement stats
  1584  				if err.Error() == driver.DriverStatsNotImplemented.Error() {
  1585  					r.logger.Printf("[DEBUG] client: driver for task %q in allocation %q doesn't support stats", r.task.Name, r.alloc.ID)
  1586  					return
  1587  				}
  1588  
  1589  				// We do not log when the plugin is shutdown as this is simply a
  1590  				// race between the stopCollection channel being closed and calling
  1591  				// Stats on the handle.
  1592  				if !strings.Contains(err.Error(), "connection is shut down") {
  1593  					r.logger.Printf("[WARN] client: error fetching stats of task %v: %v", r.task.Name, err)
  1594  				}
  1595  				continue
  1596  			}
  1597  
  1598  			r.resourceUsageLock.Lock()
  1599  			r.resourceUsage = ru
  1600  			r.resourceUsageLock.Unlock()
  1601  			if ru != nil {
  1602  				r.emitStats(ru)
  1603  			}
  1604  		case <-stopCollection:
  1605  			return
  1606  		}
  1607  	}
  1608  }
  1609  
  1610  // LatestResourceUsage returns the last resource utilization datapoint collected
  1611  func (r *TaskRunner) LatestResourceUsage() *cstructs.TaskResourceUsage {
  1612  	r.resourceUsageLock.RLock()
  1613  	defer r.resourceUsageLock.RUnlock()
  1614  	r.runningLock.Lock()
  1615  	defer r.runningLock.Unlock()
  1616  
  1617  	// If the task is not running there can be no latest resource
  1618  	if !r.running {
  1619  		return nil
  1620  	}
  1621  
  1622  	return r.resourceUsage
  1623  }
  1624  
  1625  // handleUpdate takes an updated allocation and updates internal state to
  1626  // reflect the new config for the task.
  1627  func (r *TaskRunner) handleUpdate(update *structs.Allocation) error {
  1628  	// Extract the task group from the alloc.
  1629  	tg := update.Job.LookupTaskGroup(update.TaskGroup)
  1630  	if tg == nil {
  1631  		return fmt.Errorf("alloc '%s' missing task group '%s'", update.ID, update.TaskGroup)
  1632  	}
  1633  
  1634  	// Extract the task.
  1635  	var updatedTask *structs.Task
  1636  	for _, t := range tg.Tasks {
  1637  		if t.Name == r.task.Name {
  1638  			updatedTask = t.Copy()
  1639  			break
  1640  		}
  1641  	}
  1642  	if updatedTask == nil {
  1643  		return fmt.Errorf("task group %q doesn't contain task %q", tg.Name, r.task.Name)
  1644  	}
  1645  
  1646  	// Merge in the task resources
  1647  	updatedTask.Resources = update.TaskResources[updatedTask.Name]
  1648  
  1649  	// Interpolate the old task with the old env before updating the env as
  1650  	// updating services in Consul need both the old and new interpolations
  1651  	// to find differences.
  1652  	oldInterpolatedTask := interpolateServices(r.envBuilder.Build(), r.task)
  1653  
  1654  	// Now it's safe to update the environment
  1655  	r.envBuilder.UpdateTask(update, updatedTask)
  1656  
  1657  	var mErr multierror.Error
  1658  	r.handleLock.Lock()
  1659  	if r.handle != nil {
  1660  		drv, err := r.createDriver()
  1661  		if err != nil {
  1662  			// Something has really gone wrong; don't continue
  1663  			r.handleLock.Unlock()
  1664  			return fmt.Errorf("error accessing driver when updating task %q: %v", r.task.Name, err)
  1665  		}
  1666  
  1667  		// Update will update resources and store the new kill timeout.
  1668  		if err := r.handle.Update(updatedTask); err != nil {
  1669  			mErr.Errors = append(mErr.Errors, fmt.Errorf("updating task resources failed: %v", err))
  1670  		}
  1671  
  1672  		// Update services in Consul
  1673  		newInterpolatedTask := interpolateServices(r.envBuilder.Build(), updatedTask)
  1674  		if err := r.updateServices(drv, r.handle, oldInterpolatedTask, newInterpolatedTask); err != nil {
  1675  			mErr.Errors = append(mErr.Errors, fmt.Errorf("error updating services and checks in Consul: %v", err))
  1676  		}
  1677  	}
  1678  	r.handleLock.Unlock()
  1679  
  1680  	// Update the restart policy.
  1681  	if r.restartTracker != nil {
  1682  		r.restartTracker.SetPolicy(tg.RestartPolicy)
  1683  	}
  1684  
  1685  	// Store the updated alloc.
  1686  	r.alloc = update
  1687  	r.task = updatedTask
  1688  	return mErr.ErrorOrNil()
  1689  }
  1690  
  1691  // updateServices and checks with Consul. Tasks must be interpolated!
  1692  func (r *TaskRunner) updateServices(d driver.Driver, h driver.ScriptExecutor, oldTask, newTask *structs.Task) error {
  1693  	var exec driver.ScriptExecutor
  1694  	if d.Abilities().Exec {
  1695  		// Allow set the script executor if the driver supports it
  1696  		exec = h
  1697  	}
  1698  	r.driverNetLock.Lock()
  1699  	net := r.driverNet.Copy()
  1700  	r.driverNetLock.Unlock()
  1701  	return r.consul.UpdateTask(r.alloc.ID, oldTask, newTask, r, exec, net)
  1702  }
  1703  
  1704  // handleDestroy kills the task handle. In the case that killing fails,
  1705  // handleDestroy will retry with an exponential backoff and will give up at a
  1706  // given limit. It returns whether the task was destroyed and the error
  1707  // associated with the last kill attempt.
  1708  func (r *TaskRunner) handleDestroy(handle driver.DriverHandle) (destroyed bool, err error) {
  1709  	// Cap the number of times we attempt to kill the task.
  1710  	for i := 0; i < killFailureLimit; i++ {
  1711  		if err = handle.Kill(); err != nil {
  1712  			// Calculate the new backoff
  1713  			backoff := (1 << (2 * uint64(i))) * killBackoffBaseline
  1714  			if backoff > killBackoffLimit {
  1715  				backoff = killBackoffLimit
  1716  			}
  1717  
  1718  			r.logger.Printf("[ERR] client: failed to kill task '%s' for alloc %q. Retrying in %v: %v",
  1719  				r.task.Name, r.alloc.ID, backoff, err)
  1720  			time.Sleep(backoff)
  1721  		} else {
  1722  			// Kill was successful
  1723  			return true, nil
  1724  		}
  1725  	}
  1726  	return
  1727  }
  1728  
  1729  // Restart will restart the task.
  1730  func (r *TaskRunner) Restart(source, reason string, failure bool) {
  1731  	reasonStr := fmt.Sprintf("%s: %s", source, reason)
  1732  	event := newTaskRestartEvent(reasonStr, failure)
  1733  
  1734  	select {
  1735  	case r.restartCh <- event:
  1736  	case <-r.waitCh:
  1737  	}
  1738  }
  1739  
  1740  // Signal will send a signal to the task
  1741  func (r *TaskRunner) Signal(source, reason string, s os.Signal) error {
  1742  
  1743  	reasonStr := fmt.Sprintf("%s: %s", source, reason)
  1744  	event := structs.NewTaskEvent(structs.TaskSignaling).SetTaskSignal(s).SetTaskSignalReason(reasonStr)
  1745  
  1746  	resCh := make(chan error)
  1747  	se := SignalEvent{
  1748  		s:      s,
  1749  		e:      event,
  1750  		result: resCh,
  1751  	}
  1752  
  1753  	select {
  1754  	case r.signalCh <- se:
  1755  	case <-r.waitCh:
  1756  	}
  1757  
  1758  	return <-resCh
  1759  }
  1760  
  1761  // Kill will kill a task and store the error, no longer restarting the task. If
  1762  // fail is set, the task is marked as having failed.
  1763  func (r *TaskRunner) Kill(source, reason string, fail bool) {
  1764  	reasonStr := fmt.Sprintf("%s: %s", source, reason)
  1765  	event := structs.NewTaskEvent(structs.TaskKilling).SetKillReason(reasonStr)
  1766  	if fail {
  1767  		event.SetFailsTask()
  1768  	}
  1769  
  1770  	r.logger.Printf("[DEBUG] client: killing task %v for alloc %q: %v", r.task.Name, r.alloc.ID, reasonStr)
  1771  	r.Destroy(event)
  1772  }
  1773  
  1774  func (r *TaskRunner) EmitEvent(source, message string) {
  1775  	event := structs.NewTaskEvent(source).
  1776  		SetMessage(message)
  1777  	r.setState("", event, false)
  1778  	r.logger.Printf("[DEBUG] client: event from %q for task %q in alloc %q: %v",
  1779  		source, r.task.Name, r.alloc.ID, message)
  1780  }
  1781  
  1782  // UnblockStart unblocks the starting of the task. It currently assumes only
  1783  // consul-template will unblock
  1784  func (r *TaskRunner) UnblockStart(source string) {
  1785  	r.unblockLock.Lock()
  1786  	defer r.unblockLock.Unlock()
  1787  	if r.unblocked {
  1788  		return
  1789  	}
  1790  
  1791  	r.logger.Printf("[DEBUG] client: unblocking task %v for alloc %q: %v", r.task.Name, r.alloc.ID, source)
  1792  	r.unblocked = true
  1793  	close(r.unblockCh)
  1794  }
  1795  
  1796  // Helper function for converting a WaitResult into a TaskTerminated event.
  1797  func (r *TaskRunner) waitErrorToEvent(res *dstructs.WaitResult) *structs.TaskEvent {
  1798  	return structs.NewTaskEvent(structs.TaskTerminated).
  1799  		SetExitCode(res.ExitCode).
  1800  		SetSignal(res.Signal).
  1801  		SetExitMessage(res.Err)
  1802  }
  1803  
  1804  // Update is used to update the task of the context
  1805  func (r *TaskRunner) Update(update *structs.Allocation) {
  1806  	select {
  1807  	case r.updateCh <- update:
  1808  	default:
  1809  		r.logger.Printf("[ERR] client: dropping task update '%s' (alloc '%s')",
  1810  			r.task.Name, r.alloc.ID)
  1811  	}
  1812  }
  1813  
  1814  // Destroy is used to indicate that the task context should be destroyed. The
  1815  // event parameter provides a context for the destroy.
  1816  func (r *TaskRunner) Destroy(event *structs.TaskEvent) {
  1817  	r.destroyLock.Lock()
  1818  	defer r.destroyLock.Unlock()
  1819  
  1820  	if r.destroy {
  1821  		return
  1822  	}
  1823  	r.destroy = true
  1824  	r.destroyEvent = event
  1825  	close(r.destroyCh)
  1826  }
  1827  
  1828  // getCreatedResources returns the resources created by drivers. It will never
  1829  // return nil.
  1830  func (r *TaskRunner) getCreatedResources() *driver.CreatedResources {
  1831  	r.createdResourcesLock.Lock()
  1832  	if r.createdResources == nil {
  1833  		r.createdResources = driver.NewCreatedResources()
  1834  	}
  1835  	cr := r.createdResources.Copy()
  1836  	r.createdResourcesLock.Unlock()
  1837  
  1838  	return cr
  1839  }
  1840  
  1841  // setCreatedResources updates the resources created by drivers. If passed nil
  1842  // it will set createdResources to an initialized struct.
  1843  func (r *TaskRunner) setCreatedResources(cr *driver.CreatedResources) {
  1844  	if cr == nil {
  1845  		cr = driver.NewCreatedResources()
  1846  	}
  1847  	r.createdResourcesLock.Lock()
  1848  	r.createdResources = cr.Copy()
  1849  	r.createdResourcesLock.Unlock()
  1850  }
  1851  
  1852  func (r *TaskRunner) setGaugeForMemory(ru *cstructs.TaskResourceUsage) {
  1853  	if !r.config.DisableTaggedMetrics {
  1854  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "rss"},
  1855  			float32(ru.ResourceUsage.MemoryStats.RSS), r.baseLabels)
  1856  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "rss"},
  1857  			float32(ru.ResourceUsage.MemoryStats.RSS), r.baseLabels)
  1858  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "cache"},
  1859  			float32(ru.ResourceUsage.MemoryStats.Cache), r.baseLabels)
  1860  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "swap"},
  1861  			float32(ru.ResourceUsage.MemoryStats.Swap), r.baseLabels)
  1862  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "max_usage"},
  1863  			float32(ru.ResourceUsage.MemoryStats.MaxUsage), r.baseLabels)
  1864  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "kernel_usage"},
  1865  			float32(ru.ResourceUsage.MemoryStats.KernelUsage), r.baseLabels)
  1866  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "kernel_max_usage"},
  1867  			float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage), r.baseLabels)
  1868  	}
  1869  
  1870  	if r.config.BackwardsCompatibleMetrics {
  1871  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "rss"}, float32(ru.ResourceUsage.MemoryStats.RSS))
  1872  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "cache"}, float32(ru.ResourceUsage.MemoryStats.Cache))
  1873  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "swap"}, float32(ru.ResourceUsage.MemoryStats.Swap))
  1874  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "max_usage"}, float32(ru.ResourceUsage.MemoryStats.MaxUsage))
  1875  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "kernel_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelUsage))
  1876  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "kernel_max_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage))
  1877  	}
  1878  }
  1879  
  1880  func (r *TaskRunner) setGaugeForCPU(ru *cstructs.TaskResourceUsage) {
  1881  	if !r.config.DisableTaggedMetrics {
  1882  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "total_percent"},
  1883  			float32(ru.ResourceUsage.CpuStats.Percent), r.baseLabels)
  1884  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "system"},
  1885  			float32(ru.ResourceUsage.CpuStats.SystemMode), r.baseLabels)
  1886  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "user"},
  1887  			float32(ru.ResourceUsage.CpuStats.UserMode), r.baseLabels)
  1888  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "throttled_time"},
  1889  			float32(ru.ResourceUsage.CpuStats.ThrottledTime), r.baseLabels)
  1890  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "throttled_periods"},
  1891  			float32(ru.ResourceUsage.CpuStats.ThrottledPeriods), r.baseLabels)
  1892  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "total_ticks"},
  1893  			float32(ru.ResourceUsage.CpuStats.TotalTicks), r.baseLabels)
  1894  	}
  1895  
  1896  	if r.config.BackwardsCompatibleMetrics {
  1897  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_percent"}, float32(ru.ResourceUsage.CpuStats.Percent))
  1898  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "system"}, float32(ru.ResourceUsage.CpuStats.SystemMode))
  1899  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "user"}, float32(ru.ResourceUsage.CpuStats.UserMode))
  1900  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "throttled_time"}, float32(ru.ResourceUsage.CpuStats.ThrottledTime))
  1901  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "throttled_periods"}, float32(ru.ResourceUsage.CpuStats.ThrottledPeriods))
  1902  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_ticks"}, float32(ru.ResourceUsage.CpuStats.TotalTicks))
  1903  	}
  1904  }
  1905  
  1906  // emitStats emits resource usage stats of tasks to remote metrics collector
  1907  // sinks
  1908  func (r *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) {
  1909  	if !r.config.PublishAllocationMetrics {
  1910  		return
  1911  	}
  1912  
  1913  	// If the task is not running don't emit anything
  1914  	r.runningLock.Lock()
  1915  	running := r.running
  1916  	r.runningLock.Unlock()
  1917  	if !running {
  1918  		return
  1919  	}
  1920  
  1921  	if ru.ResourceUsage.MemoryStats != nil {
  1922  		r.setGaugeForMemory(ru)
  1923  	}
  1924  
  1925  	if ru.ResourceUsage.CpuStats != nil {
  1926  		r.setGaugeForCPU(ru)
  1927  	}
  1928  }