github.com/zhizhiboom/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/client/allocrunner/taskrunner/task_runner.go

     1  package taskrunner
     2  
     3  import (
     4  	"bytes"
     5  	"crypto/md5"
     6  	"encoding/hex"
     7  	"fmt"
     8  	"io"
     9  	"io/ioutil"
    10  	"log"
    11  	"os"
    12  	"path/filepath"
    13  	"strings"
    14  	"sync"
    15  	"time"
    16  
    17  	metrics "github.com/armon/go-metrics"
    18  	"github.com/boltdb/bolt"
    19  	"github.com/golang/snappy"
    20  	"github.com/hashicorp/consul-template/signals"
    21  	"github.com/hashicorp/go-multierror"
    22  	version "github.com/hashicorp/go-version"
    23  	"github.com/hashicorp/nomad/client/allocdir"
    24  	"github.com/hashicorp/nomad/client/allocrunner/getter"
    25  	"github.com/hashicorp/nomad/client/allocrunner/taskrunner/restarts"
    26  	"github.com/hashicorp/nomad/client/config"
    27  	consulApi "github.com/hashicorp/nomad/client/consul"
    28  	"github.com/hashicorp/nomad/client/driver"
    29  	"github.com/hashicorp/nomad/client/state"
    30  	"github.com/hashicorp/nomad/client/vaultclient"
    31  	"github.com/hashicorp/nomad/command/agent/consul"
    32  	"github.com/hashicorp/nomad/nomad/structs"
    33  	"github.com/ugorji/go/codec"
    34  
    35  	"github.com/hashicorp/nomad/client/driver/env"
    36  	dstructs "github.com/hashicorp/nomad/client/driver/structs"
    37  	cstructs "github.com/hashicorp/nomad/client/structs"
    38  )
    39  
    40  const (
    41  	// killBackoffBaseline is the baseline time for exponential backoff while
    42  	// killing a task.
    43  	killBackoffBaseline = 5 * time.Second
    44  
    45  	// killBackoffLimit is the limit of the exponential backoff for killing
    46  	// the task.
    47  	killBackoffLimit = 2 * time.Minute
    48  
    49  	// killFailureLimit is how many times we will attempt to kill a task before
    50  	// giving up and potentially leaking resources.
    51  	killFailureLimit = 5
    52  
    53  	// vaultBackoffBaseline is the baseline time for exponential backoff when
    54  	// attempting to retrieve a Vault token
    55  	vaultBackoffBaseline = 5 * time.Second
    56  
    57  	// vaultBackoffLimit is the limit of the exponential backoff when attempting
    58  	// to retrieve a Vault token
    59  	vaultBackoffLimit = 3 * time.Minute
    60  
    61  	// vaultTokenFile is the name of the file holding the Vault token inside the
    62  	// task's secret directory
    63  	vaultTokenFile = "vault_token"
    64  )
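
         // The vault* constants above drive the token retry loop in
         // deriveVaultToken (see the worked schedule noted there), while the
         // kill* constants bound the destroy retries performed by
         // handleDestroy, which killTask invokes.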
    65  
    66  var (
     67  	// taskRunnerStateAllKey holds all the task runner's state. At the moment
    68  	// there is no need to split it
    69  	taskRunnerStateAllKey = []byte("simple-all")
    70  )
    71  
    72  // taskRestartEvent wraps a TaskEvent with additional metadata to control
    73  // restart behavior.
    74  type taskRestartEvent struct {
    75  	// taskEvent to report
    76  	taskEvent *structs.TaskEvent
    77  
    78  	// if false, don't count against restart count
    79  	failure bool
    80  }
    81  
    82  func newTaskRestartEvent(reason string, failure bool) *taskRestartEvent {
    83  	return &taskRestartEvent{
    84  		taskEvent: structs.NewTaskEvent(structs.TaskRestartSignal).SetRestartReason(reason),
    85  		failure:   failure,
    86  	}
    87  }
    88  
    89  // TaskRunner is used to wrap a task within an allocation and provide the execution context.
    90  type TaskRunner struct {
    91  	stateDB        *bolt.DB
    92  	config         *config.Config
    93  	updater        TaskStateUpdater
    94  	logger         *log.Logger
    95  	restartTracker *restarts.RestartTracker
    96  	consul         consulApi.ConsulServiceAPI
    97  
    98  	// running marks whether the task is running
    99  	running     bool
   100  	runningLock sync.Mutex
   101  
   102  	resourceUsage     *cstructs.TaskResourceUsage
   103  	resourceUsageLock sync.RWMutex
   104  
   105  	alloc   *structs.Allocation
   106  	task    *structs.Task
   107  	taskDir *allocdir.TaskDir
   108  
   109  	// envBuilder is used to build the task's environment
   110  	envBuilder *env.Builder
   111  
   112  	// driverNet is the network information returned by the driver
   113  	driverNet     *cstructs.DriverNetwork
   114  	driverNetLock sync.Mutex
   115  
   116  	// updateCh is used to receive updated versions of the allocation
   117  	updateCh chan *structs.Allocation
   118  
   119  	handle     driver.DriverHandle
   120  	handleLock sync.Mutex
   121  
    122  	// artifactsDownloaded tracks whether the task's artifacts have been
   123  	// downloaded
   124  	//
   125  	// Must acquire persistLock when accessing
   126  	artifactsDownloaded bool
   127  
   128  	// taskDirBuilt tracks whether the task has built its directory.
   129  	//
   130  	// Must acquire persistLock when accessing
   131  	taskDirBuilt bool
   132  
   133  	// createdResources are all the resources created by the task driver
   134  	// across all attempts to start the task.
   135  	// Simple gets and sets should use {get,set}CreatedResources
   136  	createdResources     *driver.CreatedResources
   137  	createdResourcesLock sync.Mutex
   138  
   139  	// payloadRendered tracks whether the payload has been rendered to disk
   140  	payloadRendered bool
   141  
   142  	// vaultFuture is the means to wait for and get a Vault token
   143  	vaultFuture *tokenFuture
   144  
   145  	// recoveredVaultToken is the token that was recovered through a restore
   146  	recoveredVaultToken string
   147  
   148  	// vaultClient is used to retrieve and renew any needed Vault token
   149  	vaultClient vaultclient.VaultClient
   150  
   151  	// templateManager is used to manage any consul-templates this task may have
   152  	templateManager *TaskTemplateManager
   153  
   154  	// startCh is used to trigger the start of the task
   155  	startCh chan struct{}
   156  
   157  	// unblockCh is used to unblock the starting of the task
   158  	unblockCh   chan struct{}
   159  	unblocked   bool
   160  	unblockLock sync.Mutex
   161  
   162  	// restartCh is used to restart a task
   163  	restartCh chan *taskRestartEvent
   164  
   165  	// signalCh is used to send a signal to a task
   166  	signalCh chan SignalEvent
   167  
   168  	destroy      bool
   169  	destroyCh    chan struct{}
   170  	destroyLock  sync.Mutex
   171  	destroyEvent *structs.TaskEvent
   172  
   173  	// waitCh closing marks the run loop as having exited
   174  	waitCh chan struct{}
   175  
   176  	// persistLock must be acquired when accessing fields stored by
   177  	// SaveState. SaveState is called asynchronously to TaskRunner.Run by
   178  	// AllocRunner, so all state fields must be synchronized using this
   179  	// lock.
   180  	persistLock sync.Mutex
   181  
   182  	// persistedHash is the hash of the last persisted snapshot. It is used to
   183  	// detect if a new snapshot has to be written to disk.
   184  	persistedHash []byte
   185  
   186  	// baseLabels are used when emitting tagged metrics. All task runner metrics
   187  	// will have these tags, and optionally more.
   188  	baseLabels []metrics.Label
   189  }
   190  
   191  // taskRunnerState is used to snapshot the state of the task runner
   192  type taskRunnerState struct {
   193  	Version            string
   194  	HandleID           string
   195  	ArtifactDownloaded bool
   196  	TaskDirBuilt       bool
   197  	PayloadRendered    bool
   198  	CreatedResources   *driver.CreatedResources
   199  	DriverNetwork      *cstructs.DriverNetwork
   200  }
   201  
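         // Hash returns an MD5 digest of the snapshot. SaveState compares it
         // against persistedHash to detect whether anything changed since the
         // last write; it is a change detector, not a security mechanism.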
   202  func (s *taskRunnerState) Hash() []byte {
   203  	h := md5.New()
   204  
   205  	io.WriteString(h, s.Version)
   206  	io.WriteString(h, s.HandleID)
   207  	io.WriteString(h, fmt.Sprintf("%v", s.ArtifactDownloaded))
   208  	io.WriteString(h, fmt.Sprintf("%v", s.TaskDirBuilt))
   209  	io.WriteString(h, fmt.Sprintf("%v", s.PayloadRendered))
   210  	h.Write(s.CreatedResources.Hash())
   211  	h.Write(s.DriverNetwork.Hash())
   212  
   213  	return h.Sum(nil)
   214  }
   215  
    216  // TaskStateUpdater is used to signal that a task's state has changed. If lazySync
   217  // is set the event won't be immediately pushed to the server.
   218  type TaskStateUpdater func(taskName, state string, event *structs.TaskEvent, lazySync bool)
   219  
   220  // SignalEvent is a tuple of the signal and the event generating it
   221  type SignalEvent struct {
   222  	// s is the signal to be sent
   223  	s os.Signal
   224  
   225  	// e is the task event generating the signal
   226  	e *structs.TaskEvent
   227  
   228  	// result should be used to send back the result of the signal
   229  	result chan<- error
   230  }
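
         // A SignalEvent is produced by the Signal helper (defined later in
         // this file) and consumed by the signalCh case in run(), which
         // forwards the signal to the driver handle and reports the outcome on
         // the result channel.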
   231  
   232  // NewTaskRunner is used to create a new task context
   233  func NewTaskRunner(logger *log.Logger, config *config.Config,
   234  	stateDB *bolt.DB, updater TaskStateUpdater, taskDir *allocdir.TaskDir,
   235  	alloc *structs.Allocation, task *structs.Task,
   236  	vaultClient vaultclient.VaultClient, consulClient consulApi.ConsulServiceAPI) *TaskRunner {
   237  
   238  	// Merge in the task resources
   239  	task.Resources = alloc.TaskResources[task.Name]
   240  
   241  	// Build the restart tracker.
   242  	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
   243  	if tg == nil {
    244  		logger.Printf("[ERR] client: alloc %q references missing task group %q", alloc.ID, alloc.TaskGroup)
   245  		return nil
   246  	}
   247  	restartTracker := restarts.NewRestartTracker(tg.RestartPolicy, alloc.Job.Type)
   248  
   249  	// Initialize the environment builder
   250  	envBuilder := env.NewBuilder(config.Node, alloc, task, config.Region)
   251  
   252  	tc := &TaskRunner{
   253  		config:           config,
   254  		stateDB:          stateDB,
   255  		updater:          updater,
   256  		logger:           logger,
   257  		restartTracker:   restartTracker,
   258  		alloc:            alloc,
   259  		task:             task,
   260  		taskDir:          taskDir,
   261  		envBuilder:       envBuilder,
   262  		createdResources: driver.NewCreatedResources(),
   263  		consul:           consulClient,
   264  		vaultClient:      vaultClient,
   265  		vaultFuture:      NewTokenFuture().Set(""),
   266  		updateCh:         make(chan *structs.Allocation, 64),
   267  		destroyCh:        make(chan struct{}),
   268  		waitCh:           make(chan struct{}),
   269  		startCh:          make(chan struct{}, 1),
   270  		unblockCh:        make(chan struct{}),
   271  		restartCh:        make(chan *taskRestartEvent),
   272  		signalCh:         make(chan SignalEvent),
   273  	}
   274  
   275  	tc.baseLabels = []metrics.Label{
   276  		{
   277  			Name:  "job",
   278  			Value: tc.alloc.Job.Name,
   279  		},
   280  		{
   281  			Name:  "task_group",
   282  			Value: tc.alloc.TaskGroup,
   283  		},
   284  		{
   285  			Name:  "alloc_id",
   286  			Value: tc.alloc.ID,
   287  		},
   288  		{
   289  			Name:  "task",
   290  			Value: tc.task.Name,
   291  		},
   292  	}
   293  
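         	// Child jobs carry extra labels. For example, a dispatch child with
         	// a (hypothetical) name like "batch/dispatch-6c2c6b5d" yields a
         	// dispatch_id label of "6c2c6b5d"; periodic children are labeled the
         	// same way via the "/periodic-" separator.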
   294  	if tc.alloc.Job.ParentID != "" {
   295  		tc.baseLabels = append(tc.baseLabels, metrics.Label{
   296  			Name:  "parent_id",
   297  			Value: tc.alloc.Job.ParentID,
   298  		})
   299  		if strings.Contains(tc.alloc.Job.Name, "/dispatch-") {
   300  			tc.baseLabels = append(tc.baseLabels, metrics.Label{
   301  				Name:  "dispatch_id",
   302  				Value: strings.Split(tc.alloc.Job.Name, "/dispatch-")[1],
   303  			})
   304  		}
   305  		if strings.Contains(tc.alloc.Job.Name, "/periodic-") {
   306  			tc.baseLabels = append(tc.baseLabels, metrics.Label{
   307  				Name:  "periodic_id",
   308  				Value: strings.Split(tc.alloc.Job.Name, "/periodic-")[1],
   309  			})
   310  		}
   311  		return tc
   312  	}
   313  
   314  	return tc
   315  }
   316  
   317  // MarkReceived marks the task as received.
   318  func (r *TaskRunner) MarkReceived() {
   319  	// We lazy sync this since there will be a follow up message almost
   320  	// immediately.
   321  	r.updater(r.task.Name, structs.TaskStatePending, structs.NewTaskEvent(structs.TaskReceived), true)
   322  }
   323  
   324  // WaitCh returns a channel to wait for termination
   325  func (r *TaskRunner) WaitCh() <-chan struct{} {
   326  	return r.waitCh
   327  }
   328  
   329  // getHandle returns the task's handle or nil
   330  func (r *TaskRunner) getHandle() driver.DriverHandle {
   331  	r.handleLock.Lock()
   332  	h := r.handle
   333  	r.handleLock.Unlock()
   334  	return h
   335  }
   336  
   337  // pre060StateFilePath returns the path to our state file that would have been
   338  // written pre v0.6.0
   339  // COMPAT: Remove in 0.7.0
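         // For example, a task named "web" maps to
         // "<state_dir>/alloc/<alloc_id>/task-<hex md5 of "web">/state.json".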
   340  func (r *TaskRunner) pre060StateFilePath() string {
   341  	// Get the MD5 of the task name
   342  	hashVal := md5.Sum([]byte(r.task.Name))
   343  	hashHex := hex.EncodeToString(hashVal[:])
   344  	dirName := fmt.Sprintf("task-%s", hashHex)
   345  
   346  	// Generate the path
   347  	return filepath.Join(r.config.StateDir, "alloc", r.alloc.ID, dirName, "state.json")
   348  }
   349  
   350  // RestoreState is used to restore our state. If a non-empty string is returned
   351  // the task is restarted with the string as the reason. This is useful for
   352  // backwards incompatible upgrades that need to restart tasks with a new
   353  // executor.
   354  func (r *TaskRunner) RestoreState() (string, error) {
   355  	var snap taskRunnerState
   356  	err := r.stateDB.View(func(tx *bolt.Tx) error {
   357  		bkt, err := state.GetTaskBucket(tx, r.alloc.ID, r.task.Name)
   358  		if err != nil {
   359  			return fmt.Errorf("failed to get task bucket: %v", err)
   360  		}
   361  
   362  		if err := state.GetObject(bkt, taskRunnerStateAllKey, &snap); err != nil {
   363  			return fmt.Errorf("failed to read task runner state: %v", err)
   364  		}
   365  		return nil
   366  	})
   367  	if err != nil {
   368  		return "", err
   369  	}
   370  
   371  	// Restore fields from the snapshot
   372  	r.artifactsDownloaded = snap.ArtifactDownloaded
   373  	r.taskDirBuilt = snap.TaskDirBuilt
   374  	r.payloadRendered = snap.PayloadRendered
   375  	r.setCreatedResources(snap.CreatedResources)
   376  	r.driverNet = snap.DriverNetwork
   377  
   378  	if r.task.Vault != nil {
   379  		// Read the token from the secret directory
   380  		tokenPath := filepath.Join(r.taskDir.SecretsDir, vaultTokenFile)
   381  		data, err := ioutil.ReadFile(tokenPath)
   382  		if err != nil {
   383  			if !os.IsNotExist(err) {
   384  				return "", fmt.Errorf("failed to read token for task %q in alloc %q: %v", r.task.Name, r.alloc.ID, err)
   385  			}
   386  
   387  			// Token file doesn't exist
   388  		} else {
   389  			// Store the recovered token
   390  			r.recoveredVaultToken = string(data)
   391  		}
   392  	}
   393  
   394  	// Restore the driver
   395  	restartReason := ""
   396  	if snap.HandleID != "" {
   397  		d, err := r.createDriver()
   398  		if err != nil {
   399  			return "", err
   400  		}
   401  
    402  		// Add the restored driver network to the environment
   403  		r.envBuilder.SetDriverNetwork(r.driverNet)
   404  
   405  		// Open a connection to the driver handle
   406  		ctx := driver.NewExecContext(r.taskDir, r.envBuilder.Build())
   407  		handle, err := d.Open(ctx, snap.HandleID)
   408  
   409  		// In the case it fails, we relaunch the task in the Run() method.
   410  		if err != nil {
   411  			r.logger.Printf("[ERR] client: failed to open handle to task %q for alloc %q: %v",
   412  				r.task.Name, r.alloc.ID, err)
   413  			return "", nil
   414  		}
   415  
   416  		if pre06ScriptCheck(snap.Version, r.task.Driver, r.task.Services) {
   417  			restartReason = pre06ScriptCheckReason
   418  		}
   419  
   420  		if err := r.registerServices(d, handle, r.driverNet); err != nil {
   421  			// Don't hard fail here as there's a chance this task
    422  			// registered with Consul properly when it initially
   423  			// started.
   424  			r.logger.Printf("[WARN] client: failed to register services and checks with consul for task %q in alloc %q: %v",
   425  				r.task.Name, r.alloc.ID, err)
   426  		}
   427  
   428  		r.handleLock.Lock()
   429  		r.handle = handle
   430  		r.handleLock.Unlock()
   431  
   432  		r.runningLock.Lock()
   433  		r.running = true
   434  		r.runningLock.Unlock()
   435  	}
   436  	return restartReason, nil
   437  }
   438  
   439  // ver06 is used for checking for pre-0.6 script checks
   440  var ver06 = version.Must(version.NewVersion("0.6.0dev"))
   441  
   442  // pre06ScriptCheckReason is the restart reason given when a pre-0.6 script
   443  // check is found on an exec/java task.
   444  const pre06ScriptCheckReason = "upgrading pre-0.6 script checks"
   445  
   446  // pre06ScriptCheck returns true if version is prior to 0.6.0dev, has a script
   447  // check, and uses exec or java drivers.
   448  func pre06ScriptCheck(ver, driver string, services []*structs.Service) bool {
   449  	if driver != "exec" && driver != "java" && driver != "mock_driver" {
   450  		// Only exec and java are affected
   451  		return false
   452  	}
   453  	v, err := version.NewVersion(ver)
   454  	if err != nil {
   455  		// Treat it as old
   456  		return true
   457  	}
   458  	if !v.LessThan(ver06) {
   459  		// >= 0.6.0dev
   460  		return false
   461  	}
   462  	for _, service := range services {
   463  		for _, check := range service.Checks {
   464  			if check.Type == "script" {
   465  				return true
   466  			}
   467  		}
   468  	}
   469  	return false
   470  }
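
         // For example, an exec or java task with a "script" check restored
         // from a client below 0.6.0dev returns true (restart to pick up the
         // new executor); anything at or above 0.6.0dev returns false, and an
         // unparseable version is conservatively treated as old.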
   471  
   472  // SaveState is used to snapshot our state
   473  func (r *TaskRunner) SaveState() error {
   474  	r.destroyLock.Lock()
   475  	defer r.destroyLock.Unlock()
   476  	if r.destroy {
   477  		// Don't save state if already destroyed
   478  		return nil
   479  	}
   480  
   481  	r.persistLock.Lock()
   482  	defer r.persistLock.Unlock()
   483  	snap := taskRunnerState{
   484  		Version:            r.config.Version.VersionNumber(),
   485  		ArtifactDownloaded: r.artifactsDownloaded,
   486  		TaskDirBuilt:       r.taskDirBuilt,
   487  		PayloadRendered:    r.payloadRendered,
   488  		CreatedResources:   r.getCreatedResources(),
   489  	}
   490  
   491  	r.handleLock.Lock()
   492  	if r.handle != nil {
   493  		snap.HandleID = r.handle.ID()
   494  	}
   495  	r.handleLock.Unlock()
   496  
   497  	r.driverNetLock.Lock()
   498  	snap.DriverNetwork = r.driverNet.Copy()
   499  	r.driverNetLock.Unlock()
   500  
   501  	// If nothing has changed avoid the write
   502  	h := snap.Hash()
   503  	if bytes.Equal(h, r.persistedHash) {
   504  		return nil
   505  	}
   506  
   507  	// Serialize the object
   508  	var buf bytes.Buffer
   509  	if err := codec.NewEncoder(&buf, structs.MsgpackHandle).Encode(&snap); err != nil {
   510  		return fmt.Errorf("failed to serialize snapshot: %v", err)
   511  	}
   512  
   513  	// Start the transaction.
   514  	return r.stateDB.Batch(func(tx *bolt.Tx) error {
   515  		// Grab the task bucket
   516  		taskBkt, err := state.GetTaskBucket(tx, r.alloc.ID, r.task.Name)
   517  		if err != nil {
   518  			return fmt.Errorf("failed to retrieve allocation bucket: %v", err)
   519  		}
   520  
   521  		if err := state.PutData(taskBkt, taskRunnerStateAllKey, buf.Bytes()); err != nil {
   522  			return fmt.Errorf("failed to write task_runner state: %v", err)
   523  		}
   524  
   525  		// Store the hash that was persisted
   526  		tx.OnCommit(func() {
   527  			r.persistedHash = h
   528  		})
   529  
   530  		return nil
   531  	})
   532  }
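
         // SaveState is called from setState below and asynchronously by the
         // alloc runner (see the persistLock comment on TaskRunner); the
         // persistedHash comparison above keeps those frequent calls from
         // rewriting an unchanged snapshot to BoltDB.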
   533  
   534  // DestroyState is used to cleanup after ourselves
   535  func (r *TaskRunner) DestroyState() error {
   536  	r.persistLock.Lock()
   537  	defer r.persistLock.Unlock()
   538  
   539  	return r.stateDB.Update(func(tx *bolt.Tx) error {
   540  		if err := state.DeleteTaskBucket(tx, r.alloc.ID, r.task.Name); err != nil {
   541  			return fmt.Errorf("failed to delete task bucket: %v", err)
   542  		}
   543  		return nil
   544  	})
   545  }
   546  
   547  // setState is used to update the state of the task runner
   548  func (r *TaskRunner) setState(state string, event *structs.TaskEvent, lazySync bool) {
   549  	event.PopulateEventDisplayMessage()
   550  
   551  	// Persist our state to disk.
   552  	if err := r.SaveState(); err != nil {
   553  		r.logger.Printf("[ERR] client: failed to save state of Task Runner for task %q: %v", r.task.Name, err)
   554  	}
   555  
   556  	// Indicate the task has been updated.
   557  	r.updater(r.task.Name, state, event, lazySync)
   558  }
   559  
   560  // createDriver makes a driver for the task
   561  func (r *TaskRunner) createDriver() (driver.Driver, error) {
   562  	// Create a task-specific event emitter callback to expose minimal
   563  	// state to drivers
   564  	eventEmitter := func(m string, args ...interface{}) {
   565  		msg := fmt.Sprintf(m, args...)
   566  		r.logger.Printf("[DEBUG] client: driver event for alloc %q: %s", r.alloc.ID, msg)
   567  		r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskDriverMessage).SetDriverMessage(msg), false)
   568  	}
   569  
   570  	driverCtx := driver.NewDriverContext(r.alloc.Job.Name, r.alloc.TaskGroup, r.task.Name, r.alloc.ID, r.config, r.config.Node, r.logger, eventEmitter)
   571  	d, err := driver.NewDriver(r.task.Driver, driverCtx)
   572  	if err != nil {
   573  		return nil, fmt.Errorf("failed to create driver '%s' for alloc %s: %v",
   574  			r.task.Driver, r.alloc.ID, err)
   575  	}
   576  
   577  	return d, err
   578  }
   579  
   580  // Run is a long running routine used to manage the task
   581  func (r *TaskRunner) Run() {
   582  	defer close(r.waitCh)
   583  	r.logger.Printf("[DEBUG] client: starting task context for '%s' (alloc '%s')",
   584  		r.task.Name, r.alloc.ID)
   585  
   586  	if err := r.validateTask(); err != nil {
   587  		r.setState(
   588  			structs.TaskStateDead,
   589  			structs.NewTaskEvent(structs.TaskFailedValidation).SetValidationError(err).SetFailsTask(),
   590  			false)
   591  		return
   592  	}
   593  
   594  	// Create a temporary driver so that we can determine the FSIsolation
   595  	// required. run->startTask will create a new driver after environment
    596  	// has been set up (env vars, templates, artifacts, secrets, etc.).
   597  	tmpDrv, err := r.createDriver()
   598  	if err != nil {
   599  		e := fmt.Errorf("failed to create driver of task %q for alloc %q: %v", r.task.Name, r.alloc.ID, err)
   600  		r.setState(
   601  			structs.TaskStateDead,
   602  			structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(e).SetFailsTask(),
   603  			false)
   604  		return
   605  	}
   606  
   607  	// Build base task directory structure regardless of FS isolation abilities.
   608  	// This needs to happen before we start the Vault manager and call prestart
   609  	// as both those can write to the task directories
   610  	if err := r.buildTaskDir(tmpDrv.FSIsolation()); err != nil {
   611  		e := fmt.Errorf("failed to build task directory for %q: %v", r.task.Name, err)
   612  		r.setState(
   613  			structs.TaskStateDead,
   614  			structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(e).SetFailsTask(),
   615  			false)
   616  		return
   617  	}
   618  
   619  	// If there is no Vault policy leave the static future created in
   620  	// NewTaskRunner
   621  	if r.task.Vault != nil {
   622  		// Start the go-routine to get a Vault token
   623  		r.vaultFuture.Clear()
   624  		go r.vaultManager(r.recoveredVaultToken)
   625  	}
   626  
   627  	// Start the run loop
   628  	r.run()
   629  
   630  	// Do any cleanup necessary
   631  	r.postrun()
   632  
   633  	return
   634  }
   635  
   636  // validateTask validates the fields of the task and returns an error if the
   637  // task is invalid.
   638  func (r *TaskRunner) validateTask() error {
   639  	var mErr multierror.Error
   640  
   641  	// Validate the user.
   642  	unallowedUsers := r.config.ReadStringListToMapDefault("user.blacklist", config.DefaultUserBlacklist)
   643  	checkDrivers := r.config.ReadStringListToMapDefault("user.checked_drivers", config.DefaultUserCheckedDrivers)
   644  	if _, driverMatch := checkDrivers[r.task.Driver]; driverMatch {
   645  		if _, unallowed := unallowedUsers[r.task.User]; unallowed {
   646  			mErr.Errors = append(mErr.Errors, fmt.Errorf("running as user %q is disallowed", r.task.User))
   647  		}
   648  	}
   649  
   650  	// Validate the artifacts
   651  	for i, artifact := range r.task.Artifacts {
   652  		// Verify the artifact doesn't escape the task directory.
   653  		if err := artifact.Validate(); err != nil {
   654  			// If this error occurs there is potentially a server bug or
    655  			// malicious server spoofing.
   656  			r.logger.Printf("[ERR] client: allocation %q, task %v, artifact %#v (%v) fails validation: %v",
   657  				r.alloc.ID, r.task.Name, artifact, i, err)
   658  			mErr.Errors = append(mErr.Errors, fmt.Errorf("artifact (%d) failed validation: %v", i, err))
   659  		}
   660  	}
   661  
   662  	// Validate the Service names
   663  	taskEnv := r.envBuilder.Build()
   664  	for i, service := range r.task.Services {
   665  		name := taskEnv.ReplaceEnv(service.Name)
   666  		if err := service.ValidateName(name); err != nil {
   667  			mErr.Errors = append(mErr.Errors, fmt.Errorf("service (%d) failed validation: %v", i, err))
   668  		}
   669  	}
   670  
   671  	if len(mErr.Errors) == 1 {
   672  		return mErr.Errors[0]
   673  	}
   674  	return mErr.ErrorOrNil()
   675  }
   676  
   677  // tokenFuture stores the Vault token and allows consumers to block till a valid
   678  // token exists
   679  type tokenFuture struct {
   680  	waiting []chan struct{}
   681  	token   string
   682  	set     bool
   683  	m       sync.Mutex
   684  }
   685  
   686  // NewTokenFuture returns a new token future without any token set
   687  func NewTokenFuture() *tokenFuture {
   688  	return &tokenFuture{}
   689  }
   690  
   691  // Wait returns a channel that can be waited on. When this channel unblocks, a
   692  // valid token will be available via the Get method
   693  func (f *tokenFuture) Wait() <-chan struct{} {
   694  	f.m.Lock()
   695  	defer f.m.Unlock()
   696  
   697  	c := make(chan struct{})
   698  	if f.set {
   699  		close(c)
   700  		return c
   701  	}
   702  
   703  	f.waiting = append(f.waiting, c)
   704  	return c
   705  }
   706  
   707  // Set sets the token value and unblocks any caller of Wait
   708  func (f *tokenFuture) Set(token string) *tokenFuture {
   709  	f.m.Lock()
   710  	defer f.m.Unlock()
   711  
   712  	f.set = true
   713  	f.token = token
   714  	for _, w := range f.waiting {
   715  		close(w)
   716  	}
   717  	f.waiting = nil
   718  	return f
   719  }
   720  
   721  // Clear clears the set vault token.
   722  func (f *tokenFuture) Clear() *tokenFuture {
   723  	f.m.Lock()
   724  	defer f.m.Unlock()
   725  
   726  	f.token = ""
   727  	f.set = false
   728  	return f
   729  }
   730  
   731  // Get returns the set Vault token
   732  func (f *tokenFuture) Get() string {
   733  	f.m.Lock()
   734  	defer f.m.Unlock()
   735  	return f.token
   736  }
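
         // Illustrative use of tokenFuture (a sketch, not lifted from the
         // callers in this file):
         //
         //	f := NewTokenFuture()
         //	go func() { f.Set(newToken) }() // producer, e.g. the vault manager
         //	<-f.Wait()                      // consumers block until Set is called
         //	token := f.Get()                // returns the token set above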
   737  
   738  // vaultManager should be called in a go-routine and manages the derivation,
   739  // renewal and handling of errors with the Vault token. The optional parameter
   740  // allows setting the initial Vault token. This is useful when the Vault token
   741  // is recovered off disk.
   742  func (r *TaskRunner) vaultManager(token string) {
   743  	// Helper for stopping token renewal
   744  	stopRenewal := func() {
   745  		if err := r.vaultClient.StopRenewToken(r.vaultFuture.Get()); err != nil {
   746  			r.logger.Printf("[WARN] client: failed to stop token renewal for task %v in alloc %q: %v", r.task.Name, r.alloc.ID, err)
   747  		}
   748  	}
   749  
   750  	// updatedToken lets us store state between loops. If true, a new token
   751  	// has been retrieved and we need to apply the Vault change mode
   752  	var updatedToken bool
   753  
   754  OUTER:
   755  	for {
   756  		// Check if we should exit
   757  		select {
   758  		case <-r.waitCh:
   759  			stopRenewal()
   760  			return
   761  		default:
   762  		}
   763  
   764  		// Clear the token
   765  		r.vaultFuture.Clear()
   766  
   767  		// Check if there already is a token which can be the case for
   768  		// restoring the TaskRunner
   769  		if token == "" {
   770  			// Get a token
   771  			var exit bool
   772  			token, exit = r.deriveVaultToken()
   773  			if exit {
   774  				// Exit the manager
   775  				return
   776  			}
   777  
   778  			// Write the token to disk
   779  			if err := r.writeToken(token); err != nil {
   780  				e := fmt.Errorf("failed to write Vault token to disk")
   781  				r.logger.Printf("[ERR] client: %v for task %v on alloc %q: %v", e, r.task.Name, r.alloc.ID, err)
   782  				r.Kill("vault", e.Error(), true)
   783  				return
   784  			}
   785  		}
   786  
   787  		// Start the renewal process
   788  		renewCh, err := r.vaultClient.RenewToken(token, 30)
   789  
   790  		// An error returned means the token is not being renewed
   791  		if err != nil {
   792  			r.logger.Printf("[ERR] client: failed to start renewal of Vault token for task %v on alloc %q: %v", r.task.Name, r.alloc.ID, err)
   793  			token = ""
   794  			goto OUTER
   795  		}
   796  
   797  		// The Vault token is valid now, so set it
   798  		r.vaultFuture.Set(token)
   799  
   800  		if updatedToken {
   801  			switch r.task.Vault.ChangeMode {
   802  			case structs.VaultChangeModeSignal:
   803  				s, err := signals.Parse(r.task.Vault.ChangeSignal)
   804  				if err != nil {
   805  					e := fmt.Errorf("failed to parse signal: %v", err)
   806  					r.logger.Printf("[ERR] client: %v", err)
   807  					r.Kill("vault", e.Error(), true)
   808  					return
   809  				}
   810  
   811  				if err := r.Signal("vault", "new Vault token acquired", s); err != nil {
   812  					r.logger.Printf("[ERR] client: failed to send signal to task %v for alloc %q: %v", r.task.Name, r.alloc.ID, err)
   813  					r.Kill("vault", fmt.Sprintf("failed to send signal to task: %v", err), true)
   814  					return
   815  				}
   816  			case structs.VaultChangeModeRestart:
   817  				const noFailure = false
   818  				r.Restart("vault", "new Vault token acquired", noFailure)
   819  			case structs.VaultChangeModeNoop:
   820  				fallthrough
   821  			default:
   822  				r.logger.Printf("[ERR] client: Invalid Vault change mode: %q", r.task.Vault.ChangeMode)
   823  			}
   824  
   825  			// We have handled it
   826  			updatedToken = false
   827  
   828  			// Call the handler
   829  			r.updatedTokenHandler()
   830  		}
   831  
   832  		// Start watching for renewal errors
   833  		select {
   834  		case err := <-renewCh:
   835  			// Clear the token
   836  			token = ""
   837  			r.logger.Printf("[ERR] client: failed to renew Vault token for task %v on alloc %q: %v", r.task.Name, r.alloc.ID, err)
   838  			stopRenewal()
   839  
   840  			// Check if we have to do anything
   841  			if r.task.Vault.ChangeMode != structs.VaultChangeModeNoop {
   842  				updatedToken = true
   843  			}
   844  		case <-r.waitCh:
   845  			stopRenewal()
   846  			return
   847  		}
   848  	}
   849  }
   850  
   851  // deriveVaultToken derives the Vault token using exponential backoffs. It
   852  // returns the Vault token and whether the manager should exit.
   853  func (r *TaskRunner) deriveVaultToken() (token string, exit bool) {
   854  	attempts := 0
   855  	for {
   856  		tokens, err := r.vaultClient.DeriveToken(r.alloc, []string{r.task.Name})
   857  		if err == nil {
   858  			return tokens[r.task.Name], false
   859  		}
   860  
   861  		// Check if this is a server side error
   862  		if structs.IsServerSide(err) {
   863  			r.logger.Printf("[ERR] client: failed to derive Vault token for task %v on alloc %q: %v",
   864  				r.task.Name, r.alloc.ID, err)
   865  			r.Kill("vault", fmt.Sprintf("server error deriving vault token: %v", err), true)
   866  			return "", true
   867  		}
   868  		// Check if we can't recover from the error
   869  		if !structs.IsRecoverable(err) {
   870  			r.logger.Printf("[ERR] client: failed to derive Vault token for task %v on alloc %q: %v",
   871  				r.task.Name, r.alloc.ID, err)
   872  			r.Kill("vault", fmt.Sprintf("failed to derive token: %v", err), true)
   873  			return "", true
   874  		}
   875  
   876  		// Handle the retry case
   877  		backoff := (1 << (2 * uint64(attempts))) * vaultBackoffBaseline
   878  		if backoff > vaultBackoffLimit {
   879  			backoff = vaultBackoffLimit
   880  		}
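         		// With a 5s baseline and a 3m limit this yields delays of 5s,
         		// 20s, 80s, and then 3m for every further attempt, since
         		// 1<<(2*attempts) grows 1, 4, 16, 64, ... before the cap applies.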
   881  		r.logger.Printf("[ERR] client: failed to derive Vault token for task %v on alloc %q: %v; retrying in %v",
   882  			r.task.Name, r.alloc.ID, err, backoff)
   883  
   884  		attempts++
   885  
   886  		// Wait till retrying
   887  		select {
   888  		case <-r.waitCh:
   889  			return "", true
   890  		case <-time.After(backoff):
   891  		}
   892  	}
   893  }
   894  
   895  // writeToken writes the given token to disk
   896  func (r *TaskRunner) writeToken(token string) error {
   897  	tokenPath := filepath.Join(r.taskDir.SecretsDir, vaultTokenFile)
   898  	if err := ioutil.WriteFile(tokenPath, []byte(token), 0777); err != nil {
   899  		return fmt.Errorf("failed to save Vault tokens to secret dir for task %q in alloc %q: %v", r.task.Name, r.alloc.ID, err)
   900  	}
   901  
   902  	return nil
   903  }
   904  
   905  // updatedTokenHandler is called when a new Vault token is retrieved. Things
   906  // that rely on the token should be updated here.
   907  func (r *TaskRunner) updatedTokenHandler() {
   908  
    909  	// Update the task's environment
   910  	r.envBuilder.SetVaultToken(r.vaultFuture.Get(), r.task.Vault.Env)
   911  
   912  	if r.templateManager != nil {
   913  		r.templateManager.Stop()
   914  
   915  		// Create a new templateManager
   916  		var err error
   917  		r.templateManager, err = NewTaskTemplateManager(&TaskTemplateManagerConfig{
   918  			Hooks:                r,
   919  			Templates:            r.task.Templates,
   920  			ClientConfig:         r.config,
   921  			VaultToken:           r.vaultFuture.Get(),
   922  			TaskDir:              r.taskDir.Dir,
   923  			EnvBuilder:           r.envBuilder,
   924  			MaxTemplateEventRate: DefaultMaxTemplateEventRate,
   925  		})
   926  
   927  		if err != nil {
   928  			err := fmt.Errorf("failed to build task's template manager: %v", err)
   929  			r.setState(structs.TaskStateDead,
   930  				structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(),
   931  				false)
   932  			r.logger.Printf("[ERR] client: alloc %q, task %q %v", r.alloc.ID, r.task.Name, err)
   933  			r.Kill("vault", err.Error(), true)
   934  			return
   935  		}
   936  	}
   937  }
   938  
   939  // prestart handles life-cycle tasks that occur before the task has started.
   940  // Since it's run asynchronously with the main Run() loop the alloc & task are
   941  // passed in to avoid racing with updates.
   942  func (r *TaskRunner) prestart(alloc *structs.Allocation, task *structs.Task, resultCh chan bool) {
   943  	if task.Vault != nil {
   944  		// Wait for the token
   945  		r.logger.Printf("[DEBUG] client: waiting for Vault token for task %v in alloc %q", task.Name, alloc.ID)
   946  		tokenCh := r.vaultFuture.Wait()
   947  		select {
   948  		case <-tokenCh:
   949  		case <-r.waitCh:
   950  			resultCh <- false
   951  			return
   952  		}
   953  		r.logger.Printf("[DEBUG] client: retrieved Vault token for task %v in alloc %q", task.Name, alloc.ID)
   954  		r.envBuilder.SetVaultToken(r.vaultFuture.Get(), task.Vault.Env)
   955  	}
   956  
   957  	// If the job is a dispatch job and there is a payload write it to disk
   958  	requirePayload := len(alloc.Job.Payload) != 0 &&
   959  		(r.task.DispatchPayload != nil && r.task.DispatchPayload.File != "")
   960  	if !r.payloadRendered && requirePayload {
   961  		renderTo := filepath.Join(r.taskDir.LocalDir, task.DispatchPayload.File)
   962  		decoded, err := snappy.Decode(nil, alloc.Job.Payload)
   963  		if err != nil {
   964  			r.setState(
   965  				structs.TaskStateDead,
   966  				structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(),
   967  				false)
   968  			resultCh <- false
   969  			return
   970  		}
   971  
    972  		if err := os.MkdirAll(filepath.Dir(renderTo), 0777); err != nil {
   973  			r.setState(
   974  				structs.TaskStateDead,
   975  				structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(),
   976  				false)
   977  			resultCh <- false
   978  			return
   979  		}
   980  
   981  		if err := ioutil.WriteFile(renderTo, decoded, 0777); err != nil {
   982  			r.setState(
   983  				structs.TaskStateDead,
   984  				structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(),
   985  				false)
   986  			resultCh <- false
   987  			return
   988  		}
   989  
   990  		r.payloadRendered = true
   991  	}
   992  
   993  	for {
   994  		r.persistLock.Lock()
   995  		downloaded := r.artifactsDownloaded
   996  		r.persistLock.Unlock()
   997  
   998  		// Download the task's artifacts
   999  		if !downloaded && len(task.Artifacts) > 0 {
  1000  			r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskDownloadingArtifacts), false)
  1001  			taskEnv := r.envBuilder.Build()
  1002  			for _, artifact := range task.Artifacts {
  1003  				if err := getter.GetArtifact(taskEnv, artifact, r.taskDir.Dir); err != nil {
  1004  					wrapped := fmt.Errorf("failed to download artifact %q: %v", artifact.GetterSource, err)
  1005  					r.logger.Printf("[DEBUG] client: %v", wrapped)
  1006  					r.setState(structs.TaskStatePending,
  1007  						structs.NewTaskEvent(structs.TaskArtifactDownloadFailed).SetDownloadError(wrapped), false)
  1008  					r.restartTracker.SetStartError(structs.WrapRecoverable(wrapped.Error(), err))
  1009  					goto RESTART
  1010  				}
  1011  			}
  1012  
  1013  			r.persistLock.Lock()
  1014  			r.artifactsDownloaded = true
  1015  			r.persistLock.Unlock()
  1016  		}
  1017  
  1018  		// We don't have to wait for any template
  1019  		if len(task.Templates) == 0 {
  1020  			// Send the start signal
  1021  			select {
  1022  			case r.startCh <- struct{}{}:
  1023  			default:
  1024  			}
  1025  
  1026  			resultCh <- true
  1027  			return
  1028  		}
  1029  
  1030  		// Build the template manager
  1031  		if r.templateManager == nil {
  1032  			var err error
  1033  			r.templateManager, err = NewTaskTemplateManager(&TaskTemplateManagerConfig{
  1034  				Hooks:                r,
  1035  				Templates:            r.task.Templates,
  1036  				ClientConfig:         r.config,
  1037  				VaultToken:           r.vaultFuture.Get(),
  1038  				TaskDir:              r.taskDir.Dir,
  1039  				EnvBuilder:           r.envBuilder,
  1040  				MaxTemplateEventRate: DefaultMaxTemplateEventRate,
  1041  			})
  1042  			if err != nil {
  1043  				err := fmt.Errorf("failed to build task's template manager: %v", err)
  1044  				r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(), false)
  1045  				r.logger.Printf("[ERR] client: alloc %q, task %q %v", alloc.ID, task.Name, err)
  1046  				resultCh <- false
  1047  				return
  1048  			}
  1049  		}
  1050  
  1051  		// Block for consul-template
  1052  		// TODO Hooks should register themselves as blocking and then we can
  1053  		// periodically enumerate what we are still blocked on
  1054  		select {
  1055  		case <-r.unblockCh:
  1056  			// Send the start signal
  1057  			select {
  1058  			case r.startCh <- struct{}{}:
  1059  			default:
  1060  			}
  1061  
  1062  			resultCh <- true
  1063  			return
  1064  		case <-r.waitCh:
  1065  			// The run loop has exited so exit too
  1066  			resultCh <- false
  1067  			return
  1068  		}
  1069  
  1070  	RESTART:
  1071  		restart := r.shouldRestart()
  1072  		if !restart {
  1073  			resultCh <- false
  1074  			return
  1075  		}
  1076  	}
  1077  }
  1078  
  1079  // postrun is used to do any cleanup that is necessary after exiting the runloop
  1080  func (r *TaskRunner) postrun() {
  1081  	// Stop the template manager
  1082  	if r.templateManager != nil {
  1083  		r.templateManager.Stop()
  1084  	}
  1085  }
  1086  
  1087  // run is the main run loop that handles starting the application, destroying
  1088  // it, restarts and signals.
  1089  func (r *TaskRunner) run() {
  1090  	// Predeclare things so we can jump to the RESTART
  1091  	var stopCollection chan struct{}
  1092  	var handleWaitCh chan *dstructs.WaitResult
  1093  
   1094  	// If we already have a handle, populate stopCollection and handleWaitCh
   1095  	// to restore the invariant that they exist whenever a handle does.
  1096  	handleEmpty := r.getHandle() == nil
  1097  
  1098  	if !handleEmpty {
  1099  		stopCollection = make(chan struct{})
  1100  		go r.collectResourceUsageStats(stopCollection)
  1101  		handleWaitCh = r.handle.WaitCh()
  1102  	}
  1103  
  1104  	for {
  1105  		// Do the prestart activities
  1106  		prestartResultCh := make(chan bool, 1)
  1107  		go r.prestart(r.alloc, r.task, prestartResultCh)
  1108  
  1109  	WAIT:
  1110  		for {
  1111  			select {
  1112  			case success := <-prestartResultCh:
  1113  				if !success {
  1114  					r.cleanup()
  1115  					r.setState(structs.TaskStateDead, nil, false)
  1116  					return
  1117  				}
  1118  			case <-r.startCh:
  1119  				// Start the task if not yet started or it is being forced. This logic
  1120  				// is necessary because in the case of a restore the handle already
  1121  				// exists.
  1122  				handleEmpty := r.getHandle() == nil
  1123  				if handleEmpty {
  1124  					startErr := r.startTask()
  1125  					r.restartTracker.SetStartError(startErr)
  1126  					if startErr != nil {
  1127  						r.setState("", structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(startErr), true)
  1128  						goto RESTART
  1129  					}
  1130  
  1131  					// Mark the task as started
  1132  					r.setState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted), false)
  1133  					r.runningLock.Lock()
  1134  					r.running = true
  1135  					r.runningLock.Unlock()
  1136  
  1137  					if stopCollection == nil {
  1138  						stopCollection = make(chan struct{})
  1139  						go r.collectResourceUsageStats(stopCollection)
  1140  					}
  1141  
  1142  					handleWaitCh = r.handle.WaitCh()
  1143  				}
  1144  
  1145  			case waitRes := <-handleWaitCh:
  1146  				if waitRes == nil {
  1147  					panic("nil wait")
  1148  				}
  1149  
  1150  				r.runningLock.Lock()
  1151  				r.running = false
  1152  				r.runningLock.Unlock()
  1153  
  1154  				// Stop collection of the task's resource usage
  1155  				close(stopCollection)
  1156  
  1157  				// Log whether the task was successful or not.
  1158  				r.restartTracker.SetWaitResult(waitRes)
  1159  				r.setState("", r.waitErrorToEvent(waitRes), true)
  1160  				if !waitRes.Successful() {
  1161  					r.logger.Printf("[INFO] client: task %q for alloc %q failed: %v", r.task.Name, r.alloc.ID, waitRes)
  1162  				} else {
  1163  					r.logger.Printf("[INFO] client: task %q for alloc %q completed successfully", r.task.Name, r.alloc.ID)
  1164  				}
  1165  
  1166  				break WAIT
  1167  			case update := <-r.updateCh:
  1168  				if err := r.handleUpdate(update); err != nil {
  1169  					r.logger.Printf("[ERR] client: update to task %q failed: %v", r.task.Name, err)
  1170  				}
  1171  
  1172  			case se := <-r.signalCh:
  1173  				r.runningLock.Lock()
  1174  				running := r.running
  1175  				r.runningLock.Unlock()
  1176  				common := fmt.Sprintf("signal %v to task %v for alloc %q", se.s, r.task.Name, r.alloc.ID)
  1177  				if !running {
  1178  					// Send no error
  1179  					r.logger.Printf("[DEBUG] client: skipping %s", common)
  1180  					se.result <- nil
  1181  					continue
  1182  				}
  1183  
  1184  				r.logger.Printf("[DEBUG] client: sending %s", common)
  1185  				r.setState(structs.TaskStateRunning, se.e, false)
  1186  
  1187  				res := r.handle.Signal(se.s)
  1188  				se.result <- res
  1189  
  1190  			case restartEvent := <-r.restartCh:
  1191  				r.runningLock.Lock()
  1192  				running := r.running
  1193  				r.runningLock.Unlock()
  1194  				common := fmt.Sprintf("task %v for alloc %q", r.task.Name, r.alloc.ID)
  1195  				if !running {
  1196  					r.logger.Printf("[DEBUG] client: skipping restart of %v: task isn't running", common)
  1197  					continue
  1198  				}
  1199  
  1200  				r.logger.Printf("[DEBUG] client: restarting %s: %v", common, restartEvent.taskEvent.RestartReason)
  1201  				r.setState(structs.TaskStateRunning, restartEvent.taskEvent, false)
  1202  				r.killTask(nil)
  1203  
  1204  				close(stopCollection)
  1205  
  1206  				if handleWaitCh != nil {
  1207  					<-handleWaitCh
  1208  				}
  1209  
  1210  				r.restartTracker.SetRestartTriggered(restartEvent.failure)
  1211  				break WAIT
  1212  
  1213  			case <-r.destroyCh:
  1214  				r.runningLock.Lock()
  1215  				running := r.running
  1216  				r.runningLock.Unlock()
  1217  				if !running {
  1218  					r.cleanup()
  1219  					r.setState(structs.TaskStateDead, r.destroyEvent, false)
  1220  					return
  1221  				}
  1222  
  1223  				// Remove from consul before killing the task so that traffic
  1224  				// can be rerouted
  1225  				r.removeServices()
  1226  
  1227  				// Delay actually killing the task if configured. See #244
  1228  				if r.task.ShutdownDelay > 0 {
  1229  					r.logger.Printf("[DEBUG] client: delaying shutdown of alloc %q task %q for %q",
  1230  						r.alloc.ID, r.task.Name, r.task.ShutdownDelay)
  1231  					<-time.After(r.task.ShutdownDelay)
  1232  				}
  1233  
  1234  				// Store the task event that provides context on the task
  1235  				// destroy. The Killed event is set from the alloc_runner and
  1236  				// doesn't add detail
  1237  				var killEvent *structs.TaskEvent
  1238  				if r.destroyEvent.Type != structs.TaskKilled {
  1239  					if r.destroyEvent.Type == structs.TaskKilling {
  1240  						killEvent = r.destroyEvent
  1241  					} else {
  1242  						r.setState(structs.TaskStateRunning, r.destroyEvent, false)
  1243  					}
  1244  				}
  1245  
  1246  				r.killTask(killEvent)
  1247  				close(stopCollection)
  1248  
  1249  				// Wait for handler to exit before calling cleanup
  1250  				<-handleWaitCh
  1251  				r.cleanup()
  1252  
  1253  				r.setState(structs.TaskStateDead, nil, false)
  1254  				return
  1255  			}
  1256  		}
  1257  
  1258  	RESTART:
  1259  		// shouldRestart will block if the task should restart after a delay.
  1260  		restart := r.shouldRestart()
  1261  		if !restart {
  1262  			r.cleanup()
  1263  			r.setState(structs.TaskStateDead, nil, false)
  1264  			return
  1265  		}
  1266  
  1267  		// Clear the handle so a new driver will be created.
  1268  		r.handleLock.Lock()
  1269  		r.handle = nil
  1270  		handleWaitCh = nil
  1271  		stopCollection = nil
  1272  		r.handleLock.Unlock()
  1273  	}
  1274  }
  1275  
  1276  // cleanup removes Consul entries and calls Driver.Cleanup when a task is
  1277  // stopping. Errors are logged.
  1278  func (r *TaskRunner) cleanup() {
  1279  	// Remove from Consul
  1280  	r.removeServices()
  1281  
  1282  	drv, err := r.createDriver()
  1283  	if err != nil {
  1284  		r.logger.Printf("[ERR] client: error creating driver to cleanup resources: %v", err)
  1285  		return
  1286  	}
  1287  
  1288  	res := r.getCreatedResources()
  1289  
  1290  	ctx := driver.NewExecContext(r.taskDir, r.envBuilder.Build())
  1291  	attempts := 1
  1292  	var cleanupErr error
  1293  	for retry := true; retry; attempts++ {
  1294  		cleanupErr = drv.Cleanup(ctx, res)
  1295  		retry = structs.IsRecoverable(cleanupErr)
  1296  
  1297  		// Copy current createdResources state in case SaveState is
  1298  		// called between retries
  1299  		r.setCreatedResources(res)
  1300  
  1301  		// Retry 3 times with sleeps between
  1302  		if !retry || attempts > 3 {
  1303  			break
  1304  		}
  1305  		time.Sleep(time.Duration(attempts) * time.Second)
  1306  	}
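         	// The loop above makes at most four Cleanup attempts in total (the
         	// initial call plus three retries), sleeping 1s, 2s and 3s between
         	// attempts while the error remains recoverable.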
  1307  
  1308  	if cleanupErr != nil {
  1309  		r.logger.Printf("[ERR] client: error cleaning up resources for task %q after %d attempts: %v", r.task.Name, attempts, cleanupErr)
  1310  	}
  1311  	return
  1312  }
  1313  
  1314  // shouldRestart returns if the task should restart. If the return value is
  1315  // true, the task's restart policy has already been considered and any wait time
  1316  // between restarts has been applied.
  1317  func (r *TaskRunner) shouldRestart() bool {
  1318  	state, when := r.restartTracker.GetState()
  1319  	reason := r.restartTracker.GetReason()
  1320  	switch state {
  1321  	case structs.TaskNotRestarting, structs.TaskTerminated:
  1322  		r.logger.Printf("[INFO] client: Not restarting task: %v for alloc: %v ", r.task.Name, r.alloc.ID)
  1323  		if state == structs.TaskNotRestarting {
  1324  			r.setState(structs.TaskStateDead,
  1325  				structs.NewTaskEvent(structs.TaskNotRestarting).
  1326  					SetRestartReason(reason).SetFailsTask(),
  1327  				false)
  1328  		}
  1329  		return false
  1330  	case structs.TaskRestarting:
  1331  		r.logger.Printf("[INFO] client: Restarting task %q for alloc %q in %v", r.task.Name, r.alloc.ID, when)
  1332  		r.setState(structs.TaskStatePending,
  1333  			structs.NewTaskEvent(structs.TaskRestarting).
  1334  				SetRestartDelay(when).
  1335  				SetRestartReason(reason),
  1336  			false)
  1337  	default:
  1338  		r.logger.Printf("[ERR] client: restart tracker returned unknown state: %q", state)
  1339  		return false
  1340  	}
  1341  
  1342  	// Unregister from Consul while waiting to restart.
  1343  	r.removeServices()
  1344  
  1345  	// Sleep but watch for destroy events.
  1346  	select {
  1347  	case <-time.After(when):
  1348  	case <-r.destroyCh:
  1349  	}
  1350  
  1351  	// Destroyed while we were waiting to restart, so abort.
  1352  	r.destroyLock.Lock()
  1353  	destroyed := r.destroy
  1354  	r.destroyLock.Unlock()
  1355  	if destroyed {
  1356  		r.logger.Printf("[DEBUG] client: Not restarting task: %v because it has been destroyed", r.task.Name)
  1357  		r.setState(structs.TaskStateDead, r.destroyEvent, false)
  1358  		return false
  1359  	}
  1360  
  1361  	return true
  1362  }
  1363  
  1364  // killTask kills the running task. A killing event can optionally be passed and
  1365  // this event is used to mark the task as being killed. It provides a means to
  1366  // store extra information.
  1367  func (r *TaskRunner) killTask(killingEvent *structs.TaskEvent) {
  1368  	r.runningLock.Lock()
  1369  	running := r.running
  1370  	r.runningLock.Unlock()
  1371  	if !running {
  1372  		return
  1373  	}
  1374  
  1375  	// Get the kill timeout
  1376  	timeout := driver.GetKillTimeout(r.task.KillTimeout, r.config.MaxKillTimeout)
  1377  
  1378  	// Build the event
  1379  	var event *structs.TaskEvent
  1380  	if killingEvent != nil {
  1381  		event = killingEvent
  1382  		event.Type = structs.TaskKilling
  1383  	} else {
  1384  		event = structs.NewTaskEvent(structs.TaskKilling)
  1385  	}
  1386  	event.SetKillTimeout(timeout)
  1387  
  1388  	// Mark that we received the kill event
  1389  	r.setState(structs.TaskStateRunning, event, false)
  1390  
  1391  	handle := r.getHandle()
  1392  
  1393  	// Kill the task using an exponential backoff in-case of failures.
  1394  	destroySuccess, err := r.handleDestroy(handle)
  1395  	if !destroySuccess {
  1396  		// We couldn't successfully destroy the resource created.
  1397  		r.logger.Printf("[ERR] client: failed to kill task %q. Resources may have been leaked: %v", r.task.Name, err)
  1398  	}
  1399  
  1400  	r.runningLock.Lock()
  1401  	r.running = false
  1402  	r.runningLock.Unlock()
  1403  
  1404  	// Store that the task has been destroyed and any associated error.
  1405  	r.setState("", structs.NewTaskEvent(structs.TaskKilled).SetKillError(err), true)
  1406  }
  1407  
   1408  // startTask creates the driver and starts the task. The task directory is built separately by buildTaskDir.
  1409  func (r *TaskRunner) startTask() error {
  1410  	// Create a driver
  1411  	drv, err := r.createDriver()
  1412  	if err != nil {
  1413  		return fmt.Errorf("failed to create driver of task %q for alloc %q: %v",
  1414  			r.task.Name, r.alloc.ID, err)
  1415  	}
  1416  
  1417  	// Run prestart
  1418  	ctx := driver.NewExecContext(r.taskDir, r.envBuilder.Build())
  1419  	presp, err := drv.Prestart(ctx, r.task)
  1420  
  1421  	// Merge newly created resources into previously created resources
  1422  	if presp != nil {
  1423  		r.createdResourcesLock.Lock()
  1424  		r.createdResources.Merge(presp.CreatedResources)
  1425  		r.createdResourcesLock.Unlock()
  1426  
  1427  		// Set any network configuration returned by the driver
  1428  		r.envBuilder.SetDriverNetwork(presp.Network)
  1429  	}
  1430  
  1431  	if err != nil {
  1432  		wrapped := fmt.Sprintf("failed to initialize task %q for alloc %q: %v",
  1433  			r.task.Name, r.alloc.ID, err)
  1434  		r.logger.Printf("[WARN] client: error from prestart: %s", wrapped)
  1435  		return structs.WrapRecoverable(wrapped, err)
  1436  	}
  1437  
  1438  	// Create a new context for Start since the environment may have been updated.
  1439  	ctx = driver.NewExecContext(r.taskDir, r.envBuilder.Build())
  1440  
  1441  	// Start the job
  1442  	sresp, err := drv.Start(ctx, r.task)
  1443  	if err != nil {
  1444  		wrapped := fmt.Sprintf("failed to start task %q for alloc %q: %v",
  1445  			r.task.Name, r.alloc.ID, err)
  1446  		r.logger.Printf("[WARN] client: %s", wrapped)
  1447  		return structs.WrapRecoverable(wrapped, err)
  1448  
  1449  	}
  1450  
  1451  	// Log driver network information
  1452  	if sresp.Network != nil && sresp.Network.IP != "" {
  1453  		if sresp.Network.AutoAdvertise {
  1454  			r.logger.Printf("[INFO] client: alloc %s task %s auto-advertising detected IP %s",
  1455  				r.alloc.ID, r.task.Name, sresp.Network.IP)
  1456  		} else {
  1457  			r.logger.Printf("[TRACE] client: alloc %s task %s detected IP %s but not auto-advertising",
  1458  				r.alloc.ID, r.task.Name, sresp.Network.IP)
  1459  		}
  1460  	}
  1461  
  1462  	if sresp.Network == nil || sresp.Network.IP == "" {
  1463  		r.logger.Printf("[TRACE] client: alloc %s task %s could not detect a driver IP", r.alloc.ID, r.task.Name)
  1464  	}
  1465  
  1466  	// Update environment with the network defined by the driver's Start method.
  1467  	r.envBuilder.SetDriverNetwork(sresp.Network)
  1468  
  1469  	if err := r.registerServices(drv, sresp.Handle, sresp.Network); err != nil {
   1470  		// Consul registration IO is done asynchronously, so an error here
   1471  		// indicates a local problem and is treated as a hard failure.
  1472  		r.logger.Printf("[ERR] client: failed to register services and checks for task %q alloc %q: %v", r.task.Name, r.alloc.ID, err)
  1473  
  1474  		// Kill the started task
  1475  		if destroyed, err := r.handleDestroy(sresp.Handle); !destroyed {
  1476  			r.logger.Printf("[ERR] client: failed to kill task %q alloc %q. Resources may be leaked: %v",
  1477  				r.task.Name, r.alloc.ID, err)
  1478  		}
  1479  		return structs.NewRecoverableError(err, false)
  1480  	}
  1481  
  1482  	r.handleLock.Lock()
  1483  	r.handle = sresp.Handle
  1484  	r.handleLock.Unlock()
  1485  
  1486  	// Need to persist the driver network between restarts
  1487  	r.driverNetLock.Lock()
  1488  	r.driverNet = sresp.Network
  1489  	r.driverNetLock.Unlock()
  1490  
  1491  	return nil
  1492  }
  1493  
   1494  // registerServices registers the task's services and checks with Consul.
  1495  func (r *TaskRunner) registerServices(d driver.Driver, h driver.DriverHandle, n *cstructs.DriverNetwork) error {
  1496  	var exec driver.ScriptExecutor
  1497  	if d.Abilities().Exec {
  1498  		// Only set the script executor if the driver supports it
  1499  		exec = h
  1500  	}
  1501  	interpolatedTask := interpolateServices(r.envBuilder.Build(), r.task)
  1502  	taskServices := consul.NewTaskServices(r.alloc, interpolatedTask, r, exec, n)
  1503  	return r.consul.RegisterTask(taskServices)
  1504  }
  1505  
  1506  // interpolateServices interpolates service and check fields with values from
  1507  // the task's environment.
  1508  func interpolateServices(taskEnv *env.TaskEnv, task *structs.Task) *structs.Task {
  1509  	taskCopy := task.Copy()
  1510  	for _, service := range taskCopy.Services {
  1511  		for _, check := range service.Checks {
  1512  			check.Name = taskEnv.ReplaceEnv(check.Name)
  1513  			check.Type = taskEnv.ReplaceEnv(check.Type)
  1514  			check.Command = taskEnv.ReplaceEnv(check.Command)
  1515  			check.Args = taskEnv.ParseAndReplace(check.Args)
  1516  			check.Path = taskEnv.ReplaceEnv(check.Path)
  1517  			check.Protocol = taskEnv.ReplaceEnv(check.Protocol)
  1518  			check.PortLabel = taskEnv.ReplaceEnv(check.PortLabel)
  1519  			check.InitialStatus = taskEnv.ReplaceEnv(check.InitialStatus)
  1520  			check.Method = taskEnv.ReplaceEnv(check.Method)
  1521  			check.GRPCService = taskEnv.ReplaceEnv(check.GRPCService)
  1522  			if len(check.Header) > 0 {
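        				// Build a fresh header map since both the header names and
        				// their values may be interpolated.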
  1523  				header := make(map[string][]string, len(check.Header))
  1524  				for k, vs := range check.Header {
  1525  					newVals := make([]string, len(vs))
  1526  					for i, v := range vs {
  1527  						newVals[i] = taskEnv.ReplaceEnv(v)
  1528  					}
  1529  					header[taskEnv.ReplaceEnv(k)] = newVals
  1530  				}
  1531  				check.Header = header
  1532  			}
  1533  		}
  1534  		service.Name = taskEnv.ReplaceEnv(service.Name)
  1535  		service.PortLabel = taskEnv.ReplaceEnv(service.PortLabel)
  1536  		service.Tags = taskEnv.ParseAndReplace(service.Tags)
  1537  		service.CanaryTags = taskEnv.ParseAndReplace(service.CanaryTags)
  1538  	}
  1539  	return taskCopy
  1540  }
  1541  
  1542  // buildTaskDir creates the task directory before driver.Prestart. It is safe
  1543  // to call multiple times as its state is persisted.
  1544  func (r *TaskRunner) buildTaskDir(fsi cstructs.FSIsolation) error {
  1545  	r.persistLock.Lock()
  1546  	built := r.taskDirBuilt
  1547  	r.persistLock.Unlock()
  1548  
  1549  	// We do not set the state again since this only occurs during restoration
  1550  	// and the task dir is already built. The reason we call Build again is to
  1551  	// ensure that the task dir invariants are still held.
  1552  	if !built {
  1553  		r.setState(structs.TaskStatePending,
  1554  			structs.NewTaskEvent(structs.TaskSetup).SetMessage(structs.TaskBuildingTaskDir),
  1555  			false)
  1556  	}
  1557  
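        	// Use the client's configured chroot environment if one is set;
        	// otherwise fall back to the default.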
  1558  	chroot := config.DefaultChrootEnv
  1559  	if len(r.config.ChrootEnv) > 0 {
  1560  		chroot = r.config.ChrootEnv
  1561  	}
  1562  	if err := r.taskDir.Build(built, chroot, fsi); err != nil {
  1563  		return err
  1564  	}
  1565  
  1566  	// Mark task dir as successfully built
  1567  	r.persistLock.Lock()
  1568  	r.taskDirBuilt = true
  1569  	r.persistLock.Unlock()
  1570  
  1571  	// Set path and host related env vars
  1572  	driver.SetEnvvars(r.envBuilder, fsi, r.taskDir, r.config)
  1573  	return nil
  1574  }
  1575  
  1576  // collectResourceUsageStats starts collecting resource usage stats of a Task.
  1577  // Collection ends when the passed channel is closed
  1578  func (r *TaskRunner) collectResourceUsageStats(stopCollection <-chan struct{}) {
  1579  	// start collecting the stats right away and then start collecting every
  1580  	// collection interval
  1581  	next := time.NewTimer(0)
  1582  	defer next.Stop()
  1583  	for {
  1584  		select {
  1585  		case <-next.C:
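        			// Schedule the next collection before gathering stats so a
        			// slow Stats call does not delay the following tick.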
  1586  			next.Reset(r.config.StatsCollectionInterval)
  1587  			handle := r.getHandle()
  1588  			if handle == nil {
  1589  				continue
  1590  			}
  1591  			ru, err := handle.Stats()
  1592  
  1593  			if err != nil {
  1594  				// Check if the driver doesn't implement stats
  1595  				if err.Error() == driver.DriverStatsNotImplemented.Error() {
  1596  					r.logger.Printf("[DEBUG] client: driver for task %q in allocation %q doesn't support stats", r.task.Name, r.alloc.ID)
  1597  					return
  1598  				}
  1599  
  1600  				// We do not log when the plugin is shut down as this is simply a
  1601  				// race between the stopCollection channel being closed and calling
  1602  				// Stats on the handle.
  1603  				if !strings.Contains(err.Error(), "connection is shut down") {
  1604  					r.logger.Printf("[DEBUG] client: error fetching stats of task %v: %v", r.task.Name, err)
  1605  				}
  1606  				continue
  1607  			}
  1608  
  1609  			r.resourceUsageLock.Lock()
  1610  			r.resourceUsage = ru
  1611  			r.resourceUsageLock.Unlock()
  1612  			if ru != nil {
  1613  				r.emitStats(ru)
  1614  			}
  1615  		case <-stopCollection:
  1616  			return
  1617  		}
  1618  	}
  1619  }
  1620  
  1621  // LatestResourceUsage returns the last resource utilization datapoint collected
  1622  func (r *TaskRunner) LatestResourceUsage() *cstructs.TaskResourceUsage {
  1623  	r.resourceUsageLock.RLock()
  1624  	defer r.resourceUsageLock.RUnlock()
  1625  	r.runningLock.Lock()
  1626  	defer r.runningLock.Unlock()
  1627  
  1628  	// If the task is not running there can be no latest resource
  1629  	if !r.running {
  1630  		return nil
  1631  	}
  1632  
  1633  	return r.resourceUsage
  1634  }
  1635  
  1636  // handleUpdate takes an updated allocation and updates internal state to
  1637  // reflect the new config for the task.
  1638  func (r *TaskRunner) handleUpdate(update *structs.Allocation) error {
  1639  	// Extract the task group from the alloc.
  1640  	tg := update.Job.LookupTaskGroup(update.TaskGroup)
  1641  	if tg == nil {
  1642  		return fmt.Errorf("alloc '%s' missing task group '%s'", update.ID, update.TaskGroup)
  1643  	}
  1644  
  1645  	// Extract the task.
  1646  	var updatedTask *structs.Task
  1647  	for _, t := range tg.Tasks {
  1648  		if t.Name == r.task.Name {
  1649  			updatedTask = t.Copy()
  1650  			break
  1651  		}
  1652  	}
  1653  	if updatedTask == nil {
  1654  		return fmt.Errorf("task group %q doesn't contain task %q", tg.Name, r.task.Name)
  1655  	}
  1656  
  1657  	// Merge in the task resources
  1658  	updatedTask.Resources = update.TaskResources[updatedTask.Name]
  1659  
  1660  	// Interpolate the old task with the old env before updating the env as
  1661  	// updating services in Consul need both the old and new interpolations
  1662  	// updating services in Consul needs both the old and new interpolations
  1663  	oldInterpolatedTask := interpolateServices(r.envBuilder.Build(), r.task)
  1664  
  1665  	// Now it's safe to update the environment
  1666  	r.envBuilder.UpdateTask(update, updatedTask)
  1667  
  1668  	var mErr multierror.Error
  1669  	r.handleLock.Lock()
  1670  	if r.handle != nil {
  1671  		drv, err := r.createDriver()
  1672  		if err != nil {
  1673  			// Something has really gone wrong; don't continue
  1674  			r.handleLock.Unlock()
  1675  			return fmt.Errorf("error accessing driver when updating task %q: %v", r.task.Name, err)
  1676  		}
  1677  
  1678  		// Update will update resources and store the new kill timeout.
  1679  		if err := r.handle.Update(updatedTask); err != nil {
  1680  			mErr.Errors = append(mErr.Errors, fmt.Errorf("updating task resources failed: %v", err))
  1681  		}
  1682  
  1683  		// Update services in Consul
  1684  		newInterpolatedTask := interpolateServices(r.envBuilder.Build(), updatedTask)
  1685  		if err := r.updateServices(drv, r.handle, r.alloc, oldInterpolatedTask, update, newInterpolatedTask); err != nil {
  1686  			mErr.Errors = append(mErr.Errors, fmt.Errorf("error updating services and checks in Consul: %v", err))
  1687  		}
  1688  	}
  1689  	r.handleLock.Unlock()
  1690  
  1691  	// Update the restart policy.
  1692  	if r.restartTracker != nil {
  1693  		r.restartTracker.SetPolicy(tg.RestartPolicy)
  1694  	}
  1695  
  1696  	// Store the updated alloc.
  1697  	r.alloc = update
  1698  	r.task = updatedTask
  1699  	return mErr.ErrorOrNil()
  1700  }
  1701  
  1702  // updateServices and checks with Consul. Tasks must be interpolated!
  1703  func (r *TaskRunner) updateServices(d driver.Driver, h driver.ScriptExecutor,
  1704  	oldAlloc *structs.Allocation, oldTask *structs.Task,
  1705  	newAlloc *structs.Allocation, newTask *structs.Task) error {
  1706  
  1707  	var exec driver.ScriptExecutor
  1708  	if d.Abilities().Exec {
  1709  		// Only set the script executor if the driver supports it
  1710  		exec = h
  1711  	}
  1712  	r.driverNetLock.Lock()
  1713  	net := r.driverNet.Copy()
  1714  	r.driverNetLock.Unlock()
  1715  	oldTaskServices := consul.NewTaskServices(oldAlloc, oldTask, r, exec, net)
  1716  	newTaskServices := consul.NewTaskServices(newAlloc, newTask, r, exec, net)
  1717  	return r.consul.UpdateTask(oldTaskServices, newTaskServices)
  1718  }
  1719  
  1720  // removeServices and checks from Consul. Handles interpolation and deleting
  1721  // Canary=true and Canary=false versions in case Canary=false is set at the
  1722  // same time as the alloc is stopped.
  1723  func (r *TaskRunner) removeServices() {
  1724  	interpTask := interpolateServices(r.envBuilder.Build(), r.task)
  1725  	taskServices := consul.NewTaskServices(r.alloc, interpTask, r, nil, nil)
  1726  	r.consul.RemoveTask(taskServices)
  1727  
  1728  	// Flip Canary and remove again in case canary is getting flipped at
  1729  	// the same time as the alloc is being destroyed
  1730  	taskServices.Canary = !taskServices.Canary
  1731  	r.consul.RemoveTask(taskServices)
  1732  }
  1733  
  1734  // handleDestroy kills the task handle. In the case that killing fails,
  1735  // handleDestroy will retry with an exponential backoff and will give up at a
  1736  // given limit. It returns whether the task was destroyed and the error
  1737  // associated with the last kill attempt.
  1738  func (r *TaskRunner) handleDestroy(handle driver.DriverHandle) (destroyed bool, err error) {
  1739  	// Cap the number of times we attempt to kill the task.
  1740  	for i := 0; i < killFailureLimit; i++ {
  1741  		if err = handle.Kill(); err != nil {
  1742  			// Calculate the new backoff
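        			// (killBackoffBaseline * 4^i, capped at killBackoffLimit:
        			// 5s, 20s, 80s, then 2m).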
  1743  			backoff := (1 << (2 * uint64(i))) * killBackoffBaseline
  1744  			if backoff > killBackoffLimit {
  1745  				backoff = killBackoffLimit
  1746  			}
  1747  
  1748  			r.logger.Printf("[ERR] client: failed to kill task '%s' for alloc %q. Retrying in %v: %v",
  1749  				r.task.Name, r.alloc.ID, backoff, err)
  1750  			time.Sleep(backoff)
  1751  		} else {
  1752  			// Kill was successful
  1753  			return true, nil
  1754  		}
  1755  	}
  1756  	return
  1757  }
  1758  
  1759  // Restart will restart the task.
  1760  func (r *TaskRunner) Restart(source, reason string, failure bool) {
  1761  	reasonStr := fmt.Sprintf("%s: %s", source, reason)
  1762  	event := newTaskRestartEvent(reasonStr, failure)
  1763  
  1764  	select {
  1765  	case r.restartCh <- event:
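        	// Give up rather than block forever if waitCh has been closed.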
  1766  	case <-r.waitCh:
  1767  	}
  1768  }
  1769  
  1770  // Signal will send a signal to the task
  1771  func (r *TaskRunner) Signal(source, reason string, s os.Signal) error {
  1773  	reasonStr := fmt.Sprintf("%s: %s", source, reason)
  1774  	event := structs.NewTaskEvent(structs.TaskSignaling).SetTaskSignal(s).SetTaskSignalReason(reasonStr)
  1775  
  1776  	resCh := make(chan error)
  1777  	se := SignalEvent{
  1778  		s:      s,
  1779  		e:      event,
  1780  		result: resCh,
  1781  	}
  1782  
  1783  	select {
  1784  	case r.signalCh <- se:
  1785  	case <-r.waitCh:
  1786  	}
  1787  
  1788  	return <-resCh
  1789  }
  1790  
  1791  // Kill will kill a task and store the error, no longer restarting the task. If
  1792  // fail is set, the task is marked as having failed.
  1793  func (r *TaskRunner) Kill(source, reason string, fail bool) {
  1794  	reasonStr := fmt.Sprintf("%s: %s", source, reason)
  1795  	event := structs.NewTaskEvent(structs.TaskKilling).SetKillReason(reasonStr)
  1796  	if fail {
  1797  		event.SetFailsTask()
  1798  	}
  1799  
  1800  	r.logger.Printf("[DEBUG] client: killing task %v for alloc %q: %v", r.task.Name, r.alloc.ID, reasonStr)
  1801  	r.Destroy(event)
  1802  }
  1803  
  1804  func (r *TaskRunner) EmitEvent(source, message string) {
  1805  	event := structs.NewTaskEvent(source).
  1806  		SetMessage(message)
  1807  	r.setState("", event, false)
  1808  	r.logger.Printf("[DEBUG] client: event from %q for task %q in alloc %q: %v",
  1809  		source, r.task.Name, r.alloc.ID, message)
  1810  }
  1811  
  1812  // UnblockStart unblocks the starting of the task. It currently assumes only
  1813  // consul-template will unblock it.
  1814  func (r *TaskRunner) UnblockStart(source string) {
  1815  	r.unblockLock.Lock()
  1816  	defer r.unblockLock.Unlock()
  1817  	if r.unblocked {
  1818  		return
  1819  	}
  1820  
  1821  	r.logger.Printf("[DEBUG] client: unblocking task %v for alloc %q: %v", r.task.Name, r.alloc.ID, source)
  1822  	r.unblocked = true
  1823  	close(r.unblockCh)
  1824  }
  1825  
  1826  // Helper function for converting a WaitResult into a TaskTerminated event.
  1827  // waitErrorToEvent converts a WaitResult into a TaskTerminated event.
  1828  	return structs.NewTaskEvent(structs.TaskTerminated).
  1829  		SetExitCode(res.ExitCode).
  1830  		SetSignal(res.Signal).
  1831  		SetExitMessage(res.Err)
  1832  }
  1833  
  1834  // Update is used to update the task of the context
  1835  func (r *TaskRunner) Update(update *structs.Allocation) {
  1836  	select {
  1837  	case r.updateCh <- update:
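        	// Non-blocking send: if a previous update is still pending, drop this
        	// one rather than block the caller.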
  1838  	default:
  1839  		r.logger.Printf("[ERR] client: dropping task update '%s' (alloc '%s')",
  1840  			r.task.Name, r.alloc.ID)
  1841  	}
  1842  }
  1843  
  1844  // Destroy is used to indicate that the task context should be destroyed. The
  1845  // event parameter provides a context for the destroy.
  1846  func (r *TaskRunner) Destroy(event *structs.TaskEvent) {
  1847  	r.destroyLock.Lock()
  1848  	defer r.destroyLock.Unlock()
  1849  
  1850  	if r.destroy {
  1851  		return
  1852  	}
  1853  	r.destroy = true
  1854  	r.destroyEvent = event
  1855  	close(r.destroyCh)
  1856  }
  1857  
  1858  // getCreatedResources returns the resources created by drivers. It will never
  1859  // return nil.
  1860  func (r *TaskRunner) getCreatedResources() *driver.CreatedResources {
  1861  	r.createdResourcesLock.Lock()
  1862  	if r.createdResources == nil {
  1863  		r.createdResources = driver.NewCreatedResources()
  1864  	}
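        	// Return a copy so callers cannot mutate the shared set outside the lock.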
  1865  	cr := r.createdResources.Copy()
  1866  	r.createdResourcesLock.Unlock()
  1867  
  1868  	return cr
  1869  }
  1870  
  1871  // setCreatedResources updates the resources created by drivers. If passed nil
  1872  // it will set createdResources to an initialized struct.
  1873  func (r *TaskRunner) setCreatedResources(cr *driver.CreatedResources) {
  1874  	if cr == nil {
  1875  		cr = driver.NewCreatedResources()
  1876  	}
  1877  	r.createdResourcesLock.Lock()
  1878  	r.createdResources = cr.Copy()
  1879  	r.createdResourcesLock.Unlock()
  1880  }
  1881  
  1882  func (r *TaskRunner) setGaugeForMemory(ru *cstructs.TaskResourceUsage) {
  1883  	if !r.config.DisableTaggedMetrics {
  1884  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "rss"},
  1885  			float32(ru.ResourceUsage.MemoryStats.RSS), r.baseLabels)
  1888  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "cache"},
  1889  			float32(ru.ResourceUsage.MemoryStats.Cache), r.baseLabels)
  1890  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "swap"},
  1891  			float32(ru.ResourceUsage.MemoryStats.Swap), r.baseLabels)
  1892  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "max_usage"},
  1893  			float32(ru.ResourceUsage.MemoryStats.MaxUsage), r.baseLabels)
  1894  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "kernel_usage"},
  1895  			float32(ru.ResourceUsage.MemoryStats.KernelUsage), r.baseLabels)
  1896  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "kernel_max_usage"},
  1897  			float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage), r.baseLabels)
  1898  	}
  1899  
  1900  	if r.config.BackwardsCompatibleMetrics {
  1901  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "rss"}, float32(ru.ResourceUsage.MemoryStats.RSS))
  1902  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "cache"}, float32(ru.ResourceUsage.MemoryStats.Cache))
  1903  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "swap"}, float32(ru.ResourceUsage.MemoryStats.Swap))
  1904  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "max_usage"}, float32(ru.ResourceUsage.MemoryStats.MaxUsage))
  1905  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "kernel_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelUsage))
  1906  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "kernel_max_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage))
  1907  	}
  1908  }
  1909  
  1910  func (r *TaskRunner) setGaugeForCPU(ru *cstructs.TaskResourceUsage) {
  1911  	if !r.config.DisableTaggedMetrics {
  1912  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "total_percent"},
  1913  			float32(ru.ResourceUsage.CpuStats.Percent), r.baseLabels)
  1914  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "system"},
  1915  			float32(ru.ResourceUsage.CpuStats.SystemMode), r.baseLabels)
  1916  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "user"},
  1917  			float32(ru.ResourceUsage.CpuStats.UserMode), r.baseLabels)
  1918  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "throttled_time"},
  1919  			float32(ru.ResourceUsage.CpuStats.ThrottledTime), r.baseLabels)
  1920  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "throttled_periods"},
  1921  			float32(ru.ResourceUsage.CpuStats.ThrottledPeriods), r.baseLabels)
  1922  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "total_ticks"},
  1923  			float32(ru.ResourceUsage.CpuStats.TotalTicks), r.baseLabels)
  1924  	}
  1925  
  1926  	if r.config.BackwardsCompatibleMetrics {
  1927  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_percent"}, float32(ru.ResourceUsage.CpuStats.Percent))
  1928  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "system"}, float32(ru.ResourceUsage.CpuStats.SystemMode))
  1929  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "user"}, float32(ru.ResourceUsage.CpuStats.UserMode))
  1930  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "throttled_time"}, float32(ru.ResourceUsage.CpuStats.ThrottledTime))
  1931  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "throttled_periods"}, float32(ru.ResourceUsage.CpuStats.ThrottledPeriods))
  1932  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_ticks"}, float32(ru.ResourceUsage.CpuStats.TotalTicks))
  1933  	}
  1934  }
  1935  
  1936  // emitStats emits resource usage stats of tasks to remote metrics collector
  1937  // sinks
  1938  func (r *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) {
  1939  	if !r.config.PublishAllocationMetrics {
  1940  		return
  1941  	}
  1942  
  1943  	// If the task is not running don't emit anything
  1944  	r.runningLock.Lock()
  1945  	running := r.running
  1946  	r.runningLock.Unlock()
  1947  	if !running {
  1948  		return
  1949  	}
  1950  
  1951  	if ru.ResourceUsage.MemoryStats != nil {
  1952  		r.setGaugeForMemory(ru)
  1953  	}
  1954  
  1955  	if ru.ResourceUsage.CpuStats != nil {
  1956  		r.setGaugeForCPU(ru)
  1957  	}
  1958  }