github.com/ncodes/nomad@v0.5.7-0.20170403112158-97adf4a74fb3/client/task_runner.go

     1  package client
     2  
     3  import (
     4  	"crypto/md5"
     5  	"encoding/hex"
     6  	"fmt"
     7  	"io/ioutil"
     8  	"log"
     9  	"os"
    10  	"path/filepath"
    11  	"strconv"
    12  	"strings"
    13  	"sync"
    14  	"time"
    15  
    16  	"github.com/armon/go-metrics"
    17  	"github.com/golang/snappy"
    18  	"github.com/hashicorp/consul-template/signals"
    19  	"github.com/hashicorp/go-multierror"
    20  	"github.com/ncodes/nomad/client/allocdir"
    21  	"github.com/ncodes/nomad/client/config"
    22  	"github.com/ncodes/nomad/client/driver"
    23  	"github.com/ncodes/nomad/client/getter"
    24  	"github.com/ncodes/nomad/client/vaultclient"
    25  	"github.com/ncodes/nomad/nomad/structs"
    26  
    27  	"github.com/ncodes/nomad/client/driver/env"
    28  	dstructs "github.com/ncodes/nomad/client/driver/structs"
    29  	cstructs "github.com/ncodes/nomad/client/structs"
    30  )
    31  
    32  const (
    33  	// killBackoffBaseline is the baseline time for exponential backoff while
    34  	// killing a task.
    35  	killBackoffBaseline = 5 * time.Second
    36  
    37  	// killBackoffLimit is the limit of the exponential backoff for killing
    38  	// the task.
    39  	killBackoffLimit = 2 * time.Minute
    40  
    41  	// killFailureLimit is how many times we will attempt to kill a task before
    42  	// giving up and potentially leaking resources.
    43  	killFailureLimit = 5
    44  
    45  	// vaultBackoffBaseline is the baseline time for exponential backoff when
    46  	// attempting to retrieve a Vault token
    47  	vaultBackoffBaseline = 5 * time.Second
    48  
    49  	// vaultBackoffLimit is the limit of the exponential backoff when attempting
    50  	// to retrieve a Vault token
    51  	vaultBackoffLimit = 3 * time.Minute
    52  
    53  	// vaultTokenFile is the name of the file holding the Vault token inside the
    54  	// task's secret directory
    55  	vaultTokenFile = "vault_token"
    56  )
    57  
    58  // TaskRunner is used to wrap a task within an allocation and provide the execution context.
    59  type TaskRunner struct {
    60  	config         *config.Config
    61  	updater        TaskStateUpdater
    62  	logger         *log.Logger
    63  	alloc          *structs.Allocation
    64  	restartTracker *RestartTracker
    65  
    66  	// running marks whether the task is running
    67  	running     bool
    68  	runningLock sync.Mutex
    69  
    70  	resourceUsage     *cstructs.TaskResourceUsage
    71  	resourceUsageLock sync.RWMutex
    72  
    73  	task    *structs.Task
    74  	taskDir *allocdir.TaskDir
    75  
    76  	// taskEnv is the environment variables of the task
    77  	taskEnv     *env.TaskEnvironment
    78  	taskEnvLock sync.Mutex
    79  
    80  	// updateCh is used to receive updated versions of the allocation
    81  	updateCh chan *structs.Allocation
    82  
    83  	handle     driver.DriverHandle
    84  	handleLock sync.Mutex
    85  
    86  	// artifactsDownloaded tracks whether the tasks artifacts have been
    87  	// downloaded
    88  	//
    89  	// Must acquire persistLock when accessing
    90  	artifactsDownloaded bool
    91  
    92  	// taskDirBuilt tracks whether the task has built its directory.
    93  	//
    94  	// Must acquire persistLock when accessing
    95  	taskDirBuilt bool
    96  
    97  	// createdResources are all the resources created by the task driver
    98  	// across all attempts to start the task.
    99  	// Simple gets and sets should use {get,set}CreatedResources
   100  	createdResources     *driver.CreatedResources
   101  	createdResourcesLock sync.Mutex
   102  
   103  	// payloadRendered tracks whether the payload has been rendered to disk
   104  	payloadRendered bool
   105  
   106  	// vaultFuture is the means to wait for and get a Vault token
   107  	vaultFuture *tokenFuture
   108  
   109  	// recoveredVaultToken is the token that was recovered through a restore
   110  	recoveredVaultToken string
   111  
   112  	// vaultClient is used to retrieve and renew any needed Vault token
   113  	vaultClient vaultclient.VaultClient
   114  
   115  	// templateManager is used to manage any consul-templates this task may have
   116  	templateManager *TaskTemplateManager
   117  
   118  	// startCh is used to trigger the start of the task
   119  	startCh chan struct{}
   120  
   121  	// unblockCh is used to unblock the starting of the task
   122  	unblockCh   chan struct{}
   123  	unblocked   bool
   124  	unblockLock sync.Mutex
   125  
   126  	// restartCh is used to restart a task
   127  	restartCh chan *structs.TaskEvent
   128  
   129  	// signalCh is used to send a signal to a task
   130  	signalCh chan SignalEvent
   131  
   132  	destroy      bool
   133  	destroyCh    chan struct{}
   134  	destroyLock  sync.Mutex
   135  	destroyEvent *structs.TaskEvent
   136  
   137  	// waitCh closing marks the run loop as having exited
   138  	waitCh chan struct{}
   139  
   140  	// persistLock must be acquired when accessing fields stored by
   141  	// SaveState. SaveState is called asynchronously to TaskRunner.Run by
   142  	// AllocRunner, so all state fields must be synchronized using this
   143  	// lock.
   144  	persistLock sync.Mutex
   145  
   146  	// taskRunnerPlus contains unofficial features
   147  	taskRunnerPlus *TaskRunnerPlus
   148  }
   149  
   150  // taskRunnerState is used to snapshot the state of the task runner
   151  type taskRunnerState struct {
   152  	Version            string
   153  	Task               *structs.Task
   154  	HandleID           string
   155  	ArtifactDownloaded bool
   156  	TaskDirBuilt       bool
   157  	CreatedResources   *driver.CreatedResources
   158  	PayloadRendered    bool
   159  }
   160  
   161  // TaskStateUpdater is used to signal that a task's state has changed.
   162  type TaskStateUpdater func(taskName, state string, event *structs.TaskEvent)
   163  
   164  // SignalEvent is a tuple of the signal and the event generating it
   165  type SignalEvent struct {
   166  	// s is the signal to be sent
   167  	s os.Signal
   168  
   169  	// e is the task event generating the signal
   170  	e *structs.TaskEvent
   171  
   172  	// result should be used to send back the result of the signal
   173  	result chan<- error
   174  }
   175  
   176  // NewTaskRunner is used to create a new task context
   177  func NewTaskRunner(logger *log.Logger, config *config.Config,
   178  	updater TaskStateUpdater, taskDir *allocdir.TaskDir,
   179  	alloc *structs.Allocation, task *structs.Task,
   180  	vaultClient vaultclient.VaultClient) *TaskRunner {
   181  
   182  	// Merge in the task resources
   183  	task.Resources = alloc.TaskResources[task.Name]
   184  
   185  	// Build the restart tracker.
   186  	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
   187  	if tg == nil {
   188  		logger.Printf("[ERR] client: alloc '%s' references missing task group '%s'", alloc.ID, alloc.TaskGroup)
   189  		return nil
   190  	}
   191  	restartTracker := newRestartTracker(tg.RestartPolicy, alloc.Job.Type)
   192  
   193  	tc := &TaskRunner{
   194  		config:           config,
   195  		updater:          updater,
   196  		logger:           logger,
   197  		restartTracker:   restartTracker,
   198  		alloc:            alloc,
   199  		task:             task,
   200  		taskDir:          taskDir,
   201  		createdResources: driver.NewCreatedResources(),
   202  		vaultClient:      vaultClient,
   203  		vaultFuture:      NewTokenFuture().Set(""),
   204  		updateCh:         make(chan *structs.Allocation, 64),
   205  		destroyCh:        make(chan struct{}),
   206  		waitCh:           make(chan struct{}),
   207  		startCh:          make(chan struct{}, 1),
   208  		unblockCh:        make(chan struct{}),
   209  		restartCh:        make(chan *structs.TaskEvent),
   210  		signalCh:         make(chan SignalEvent),
   211  		taskRunnerPlus:   NewTaskRunnerPlus(logger, task.Env),
   212  	}
   213  
   214  	return tc
   215  }
   216  
   217  // MarkReceived marks the task as received.
   218  func (r *TaskRunner) MarkReceived() {
   219  	r.updater(r.task.Name, structs.TaskStatePending, structs.NewTaskEvent(structs.TaskReceived))
   220  }
   221  
   222  // WaitCh returns a channel to wait for termination
   223  func (r *TaskRunner) WaitCh() <-chan struct{} {
   224  	return r.waitCh
   225  }
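
// Illustrative sketch (not part of the original task runner): the typical
// lifecycle an AllocRunner-style caller drives a TaskRunner through. The
// function name and all arguments are hypothetical and assumed to be prepared
// by the caller; error handling is reduced to the nil check NewTaskRunner
// can return.
func exampleTaskRunnerLifecycle(logger *log.Logger, cfg *config.Config,
	updater TaskStateUpdater, taskDir *allocdir.TaskDir,
	alloc *structs.Allocation, task *structs.Task,
	vc vaultclient.VaultClient) {

	tr := NewTaskRunner(logger, cfg, updater, taskDir, alloc, task, vc)
	if tr == nil {
		return // the alloc referenced a missing task group
	}

	tr.MarkReceived() // report the task as received while still pending
	go tr.Run()       // run the task asynchronously

	// ... later, when the allocation is being stopped, destroy the task and
	// wait for the run loop to exit.
	tr.Destroy(structs.NewTaskEvent(structs.TaskKilled))
	<-tr.WaitCh()
}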
   226  
   227  // stateFilePath returns the path to our state file
   228  func (r *TaskRunner) stateFilePath() string {
   229  	// Get the MD5 of the task name
   230  	hashVal := md5.Sum([]byte(r.task.Name))
   231  	hashHex := hex.EncodeToString(hashVal[:])
   232  	dirName := fmt.Sprintf("task-%s", hashHex)
   233  
   234  	// Generate the path
   235  	path := filepath.Join(r.config.StateDir, "alloc", r.alloc.ID,
   236  		dirName, "state.json")
   237  	return path
   238  }
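
// For illustration (a hypothetical path, not taken from this file): with a
// StateDir of "/var/lib/nomad/client", the snapshot for a task would live at
//
//	/var/lib/nomad/client/alloc/<alloc id>/task-<md5 hex of task name>/state.json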
   239  
   240  // RestoreState is used to restore our state
   241  func (r *TaskRunner) RestoreState() error {
   242  	// Load the snapshot
   243  	var snap taskRunnerState
   244  	if err := restoreState(r.stateFilePath(), &snap); err != nil {
   245  		return err
   246  	}
   247  
   248  	// Restore fields
   249  	if snap.Task == nil {
   250  		return fmt.Errorf("task runner snapshot includes nil Task")
   251  	} else {
   252  		r.task = snap.Task
   253  	}
   254  	r.artifactsDownloaded = snap.ArtifactDownloaded
   255  	r.taskDirBuilt = snap.TaskDirBuilt
   256  	r.payloadRendered = snap.PayloadRendered
   257  
   258  	r.setCreatedResources(snap.CreatedResources)
   259  
   260  	if err := r.setTaskEnv(); err != nil {
   261  		return fmt.Errorf("client: failed to create task environment for task %q in allocation %q: %v",
   262  			r.task.Name, r.alloc.ID, err)
   263  	}
   264  
   265  	if r.task.Vault != nil {
   266  		// Read the token from the secret directory
   267  		tokenPath := filepath.Join(r.taskDir.SecretsDir, vaultTokenFile)
   268  		data, err := ioutil.ReadFile(tokenPath)
   269  		if err != nil {
   270  			if !os.IsNotExist(err) {
   271  				return fmt.Errorf("failed to read token for task %q in alloc %q: %v", r.task.Name, r.alloc.ID, err)
   272  			}
   273  
   274  			// Token file doesn't exist
   275  		} else {
   276  			// Store the recovered token
   277  			r.recoveredVaultToken = string(data)
   278  		}
   279  	}
   280  
   281  	// Restore the driver
   282  	if snap.HandleID != "" {
   283  		d, err := r.createDriver()
   284  		if err != nil {
   285  			return err
   286  		}
   287  
   288  		ctx := driver.NewExecContext(r.taskDir)
   289  		handle, err := d.Open(ctx, snap.HandleID)
   290  
   291  		// In the case it fails, we relaunch the task in the Run() method.
   292  		if err != nil {
   293  			r.logger.Printf("[ERR] client: failed to open handle to task %q for alloc %q: %v",
   294  				r.task.Name, r.alloc.ID, err)
   295  			return nil
   296  		}
   297  		r.handleLock.Lock()
   298  		r.handle = handle
   299  		r.handleLock.Unlock()
   300  
   301  		r.runningLock.Lock()
   302  		r.running = true
   303  		r.runningLock.Unlock()
   304  	}
   305  	return nil
   306  }
   307  
   308  // SaveState is used to snapshot our state
   309  func (r *TaskRunner) SaveState() error {
   310  	r.persistLock.Lock()
   311  	defer r.persistLock.Unlock()
   312  
   313  	snap := taskRunnerState{
   314  		Task:               r.task,
   315  		Version:            r.config.Version,
   316  		ArtifactDownloaded: r.artifactsDownloaded,
   317  		TaskDirBuilt:       r.taskDirBuilt,
   318  		PayloadRendered:    r.payloadRendered,
   319  		CreatedResources:   r.getCreatedResources(),
   320  	}
   321  
   322  	r.handleLock.Lock()
   323  	if r.handle != nil {
   324  		snap.HandleID = r.handle.ID()
   325  	}
   326  	r.handleLock.Unlock()
   327  	return persistState(r.stateFilePath(), &snap)
   328  }
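
// For illustration only (an assumption based on the "state.json" file name,
// not confirmed by this file): persistState presumably writes taskRunnerState
// as JSON, roughly of the shape
//
//	{"Version":"0.5.7","Task":{...},"HandleID":"...","ArtifactDownloaded":true,
//	 "TaskDirBuilt":true,"CreatedResources":{...},"PayloadRendered":false}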
   329  
   330  // DestroyState is used to cleanup after ourselves
   331  func (r *TaskRunner) DestroyState() error {
   332  	r.persistLock.Lock()
   333  	defer r.persistLock.Unlock()
   334  
   335  	return os.RemoveAll(r.stateFilePath())
   336  }
   337  
   338  // setState is used to update the state of the task runner
   339  func (r *TaskRunner) setState(state string, event *structs.TaskEvent) {
   340  	// Persist our state to disk.
   341  	if err := r.SaveState(); err != nil {
   342  		r.logger.Printf("[ERR] client: failed to save state of Task Runner for task %q: %v", r.task.Name, err)
   343  	}
   344  
   345  	// Indicate the task has been updated.
   346  	r.updater(r.task.Name, state, event)
   347  }
   348  
   349  // setTaskEnv sets the task environment. It returns an error if it could not be
   350  // created.
   351  func (r *TaskRunner) setTaskEnv() error {
   352  	r.taskEnvLock.Lock()
   353  	defer r.taskEnvLock.Unlock()
   354  
   355  	taskEnv, err := driver.GetTaskEnv(r.taskDir, r.config.Node,
   356  		r.task.Copy(), r.alloc, r.config, r.vaultFuture.Get())
   357  	if err != nil {
   358  		return err
   359  	}
   360  	r.taskEnv = taskEnv
   361  	return nil
   362  }
   363  
   364  // getTaskEnv returns the task environment
   365  func (r *TaskRunner) getTaskEnv() *env.TaskEnvironment {
   366  	r.taskEnvLock.Lock()
   367  	defer r.taskEnvLock.Unlock()
   368  	return r.taskEnv
   369  }
   370  
   371  // createDriver makes a driver for the task
   372  func (r *TaskRunner) createDriver() (driver.Driver, error) {
   373  	env := r.getTaskEnv()
   374  	if env == nil {
   375  		return nil, fmt.Errorf("task environment not made for task %q in allocation %q", r.task.Name, r.alloc.ID)
   376  	}
   377  
   378  	// Create a task-specific event emitter callback to expose minimal
   379  	// state to drivers
   380  	eventEmitter := func(m string, args ...interface{}) {
   381  		msg := fmt.Sprintf(m, args...)
   382  		r.logger.Printf("[DEBUG] client: driver event for alloc %q: %s", r.alloc.ID, msg)
   383  		r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskDriverMessage).SetDriverMessage(msg))
   384  	}
   385  
   386  	driverCtx := driver.NewDriverContext(r.task.Name, r.alloc.ID, r.config, r.config.Node, r.logger, env, eventEmitter)
   387  	driver, err := driver.NewDriver(r.task.Driver, driverCtx)
   388  	if err != nil {
   389  		return nil, fmt.Errorf("failed to create driver '%s' for alloc %s: %v",
   390  			r.task.Driver, r.alloc.ID, err)
   391  	}
   392  	return driver, err
   393  }
   394  
   395  // Run is a long running routine used to manage the task
   396  func (r *TaskRunner) Run() {
   397  	defer close(r.waitCh)
   398  	r.logger.Printf("[DEBUG] client: starting task context for '%s' (alloc '%s')",
   399  		r.task.Name, r.alloc.ID)
   400  
   401  	// Create the initial environment; it will be recreated if a Vault token
   402  	// is needed
   403  	if err := r.setTaskEnv(); err != nil {
   404  		r.setState(
   405  			structs.TaskStateDead,
   406  			structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err))
   407  		return
   408  	}
   409  
   410  	if err := r.validateTask(); err != nil {
   411  		r.setState(
   412  			structs.TaskStateDead,
   413  			structs.NewTaskEvent(structs.TaskFailedValidation).SetValidationError(err).SetFailsTask())
   414  		return
   415  	}
   416  
   417  	// Create a driver so that we can determine the FSIsolation required
   418  	drv, err := r.createDriver()
   419  	if err != nil {
   420  		e := fmt.Errorf("failed to create driver of task %q for alloc %q: %v", r.task.Name, r.alloc.ID, err)
   421  		r.setState(
   422  			structs.TaskStateDead,
   423  			structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(e).SetFailsTask())
   424  		return
   425  	}
   426  
   427  	// Build base task directory structure regardless of FS isolation abilities.
   428  	// This needs to happen before we start the Vault manager and call prestart
   429  	// as both those can write to the task directories
   430  	if err := r.buildTaskDir(drv.FSIsolation()); err != nil {
   431  		e := fmt.Errorf("failed to build task directory for %q: %v", r.task.Name, err)
   432  		r.setState(
   433  			structs.TaskStateDead,
   434  			structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(e).SetFailsTask())
   435  		return
   436  	}
   437  
   438  	// If there is no Vault policy leave the static future created in
   439  	// NewTaskRunner
   440  	if r.task.Vault != nil {
   441  		// Start the go-routine to get a Vault token
   442  		r.vaultFuture.Clear()
   443  		go r.vaultManager(r.recoveredVaultToken)
   444  	}
   445  
   446  	// Start the run loop
   447  	r.run()
   448  
   449  	// Do any cleanup necessary
   450  	r.postrun()
   451  
   452  	return
   453  }
   454  
   455  // validateTask validates the fields of the task and returns an error if the
   456  // task is invalid.
   457  func (r *TaskRunner) validateTask() error {
   458  	var mErr multierror.Error
   459  
   460  	// Validate the user.
   461  	unallowedUsers := r.config.ReadStringListToMapDefault("user.blacklist", config.DefaultUserBlacklist)
   462  	checkDrivers := r.config.ReadStringListToMapDefault("user.checked_drivers", config.DefaultUserCheckedDrivers)
   463  	if _, driverMatch := checkDrivers[r.task.Driver]; driverMatch {
   464  		if _, unallowed := unallowedUsers[r.task.User]; unallowed {
   465  			mErr.Errors = append(mErr.Errors, fmt.Errorf("running as user %q is disallowed", r.task.User))
   466  		}
   467  	}
   468  
   469  	// Validate the artifacts
   470  	for i, artifact := range r.task.Artifacts {
   471  		// Verify the artifact doesn't escape the task directory.
   472  		if err := artifact.Validate(); err != nil {
   473  			// If this error occurs there is potentially a server bug or
   474  			// malicious server spoofing.
   475  			r.logger.Printf("[ERR] client: allocation %q, task %v, artifact %#v (%v) fails validation: %v",
   476  				r.alloc.ID, r.task.Name, artifact, i, err)
   477  			mErr.Errors = append(mErr.Errors, fmt.Errorf("artifact (%d) failed validation: %v", i, err))
   478  		}
   479  	}
   480  
   481  	// Validate the Service names
   482  	for i, service := range r.task.Services {
   483  		name := r.taskEnv.ReplaceEnv(service.Name)
   484  		if err := service.ValidateName(name); err != nil {
   485  			mErr.Errors = append(mErr.Errors, fmt.Errorf("service (%d) failed validation: %v", i, err))
   486  		}
   487  	}
   488  
   489  	if len(mErr.Errors) == 1 {
   490  		return mErr.Errors[0]
   491  	}
   492  	return mErr.ErrorOrNil()
   493  }
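
// Minimal sketch of the go-multierror accumulation pattern used above
// (illustration only; the function and its inputs are hypothetical): errors
// are collected as they are found, a single error is returned unwrapped, and
// ErrorOrNil yields nil when nothing was appended.
func exampleMultierrorUsage(inputs []string) error {
	var mErr multierror.Error
	for i, in := range inputs {
		if in == "" {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("input (%d) is empty", i))
		}
	}
	if len(mErr.Errors) == 1 {
		// Avoid the multierror wrapper when there is only one failure.
		return mErr.Errors[0]
	}
	return mErr.ErrorOrNil()
}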
   494  
   495  // tokenFuture stores the Vault token and allows consumers to block till a valid
   496  // token exists
   497  type tokenFuture struct {
   498  	waiting []chan struct{}
   499  	token   string
   500  	set     bool
   501  	m       sync.Mutex
   502  }
   503  
   504  // NewTokenFuture returns a new token future without any token set
   505  func NewTokenFuture() *tokenFuture {
   506  	return &tokenFuture{}
   507  }
   508  
   509  // Wait returns a channel that can be waited on. When this channel unblocks, a
   510  // valid token will be available via the Get method
   511  func (f *tokenFuture) Wait() <-chan struct{} {
   512  	f.m.Lock()
   513  	defer f.m.Unlock()
   514  
   515  	c := make(chan struct{})
   516  	if f.set {
   517  		close(c)
   518  		return c
   519  	}
   520  
   521  	f.waiting = append(f.waiting, c)
   522  	return c
   523  }
   524  
   525  // Set sets the token value and unblocks any caller of Wait
   526  func (f *tokenFuture) Set(token string) *tokenFuture {
   527  	f.m.Lock()
   528  	defer f.m.Unlock()
   529  
   530  	f.set = true
   531  	f.token = token
   532  	for _, w := range f.waiting {
   533  		close(w)
   534  	}
   535  	f.waiting = nil
   536  	return f
   537  }
   538  
   539  // Clear clears the set vault token.
   540  func (f *tokenFuture) Clear() *tokenFuture {
   541  	f.m.Lock()
   542  	defer f.m.Unlock()
   543  
   544  	f.token = ""
   545  	f.set = false
   546  	return f
   547  }
   548  
   549  // Get returns the set Vault token
   550  func (f *tokenFuture) Get() string {
   551  	f.m.Lock()
   552  	defer f.m.Unlock()
   553  	return f.token
   554  }
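
// Illustrative usage sketch (not part of the original code): a consumer blocks
// on the tokenFuture until a producer publishes a token. The token value below
// is a placeholder, not a real Vault token.
func exampleTokenFutureUsage() {
	f := NewTokenFuture()

	// Producer: derive a token elsewhere, then publish it.
	go func() {
		f.Set("example-token")
	}()

	// Consumer: Wait returns a channel that is closed once a token is set.
	<-f.Wait()
	fmt.Printf("token available: %q\n", f.Get())
}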
   555  
   556  // vaultManager should be called in a go-routine and manages the derivation,
   557  // renewal and handling of errors with the Vault token. The optional parameter
   558  // allows setting the initial Vault token. This is useful when the Vault token
   559  // is recovered off disk.
   560  func (r *TaskRunner) vaultManager(token string) {
   561  	// Helper for stopping token renewal
   562  	stopRenewal := func() {
   563  		if err := r.vaultClient.StopRenewToken(r.vaultFuture.Get()); err != nil {
   564  			r.logger.Printf("[WARN] client: failed to stop token renewal for task %v in alloc %q: %v", r.task.Name, r.alloc.ID, err)
   565  		}
   566  	}
   567  
   568  	// updatedToken lets us store state between loops. If true, a new token
   569  	// has been retrieved and we need to apply the Vault change mode
   570  	var updatedToken bool
   571  
   572  OUTER:
   573  	for {
   574  		// Check if we should exit
   575  		select {
   576  		case <-r.waitCh:
   577  			stopRenewal()
   578  			return
   579  		default:
   580  		}
   581  
   582  		// Clear the token
   583  		r.vaultFuture.Clear()
   584  
   585  		// Check if there already is a token which can be the case for
   586  		// restoring the TaskRunner
   587  		if token == "" {
   588  			// Get a token
   589  			var exit bool
   590  			token, exit = r.deriveVaultToken()
   591  			if exit {
   592  				// Exit the manager
   593  				return
   594  			}
   595  
   596  			// Write the token to disk
   597  			if err := r.writeToken(token); err != nil {
   598  				e := fmt.Errorf("failed to write Vault token to disk")
   599  				r.logger.Printf("[ERR] client: %v for task %v on alloc %q: %v", e, r.task.Name, r.alloc.ID, err)
   600  				r.Kill("vault", e.Error(), true)
   601  				return
   602  			}
   603  		}
   604  
   605  		// Start the renewal process
   606  		renewCh, err := r.vaultClient.RenewToken(token, 30)
   607  
   608  		// An error returned means the token is not being renewed
   609  		if err != nil {
   610  			r.logger.Printf("[ERR] client: failed to start renewal of Vault token for task %v on alloc %q: %v", r.task.Name, r.alloc.ID, err)
   611  			token = ""
   612  			goto OUTER
   613  		}
   614  
   615  		// The Vault token is valid now, so set it
   616  		r.vaultFuture.Set(token)
   617  
   618  		if updatedToken {
   619  			switch r.task.Vault.ChangeMode {
   620  			case structs.VaultChangeModeSignal:
   621  				s, err := signals.Parse(r.task.Vault.ChangeSignal)
   622  				if err != nil {
   623  					e := fmt.Errorf("failed to parse signal: %v", err)
   624  					r.logger.Printf("[ERR] client: %v", e)
   625  					r.Kill("vault", e.Error(), true)
   626  					return
   627  				}
   628  
   629  				if err := r.Signal("vault", "new Vault token acquired", s); err != nil {
   630  					r.logger.Printf("[ERR] client: failed to send signal to task %v for alloc %q: %v", r.task.Name, r.alloc.ID, err)
   631  					r.Kill("vault", fmt.Sprintf("failed to send signal to task: %v", err), true)
   632  					return
   633  				}
   634  			case structs.VaultChangeModeRestart:
   635  				r.Restart("vault", "new Vault token acquired")
   636  			case structs.VaultChangeModeNoop:
   637  				fallthrough
   638  			default:
   639  				r.logger.Printf("[ERR] client: Invalid Vault change mode: %q", r.task.Vault.ChangeMode)
   640  			}
   641  
   642  			// We have handled it
   643  			updatedToken = false
   644  
   645  			// Call the handler
   646  			r.updatedTokenHandler()
   647  		}
   648  
   649  		// Start watching for renewal errors
   650  		select {
   651  		case err := <-renewCh:
   652  			// Clear the token
   653  			token = ""
   654  			r.logger.Printf("[ERR] client: failed to renew Vault token for task %v on alloc %q: %v", r.task.Name, r.alloc.ID, err)
   655  			stopRenewal()
   656  
   657  			// Check if we have to do anything
   658  			if r.task.Vault.ChangeMode != structs.VaultChangeModeNoop {
   659  				updatedToken = true
   660  			}
   661  		case <-r.waitCh:
   662  			stopRenewal()
   663  			return
   664  		}
   665  	}
   666  }
   667  
   668  // deriveVaultToken derives the Vault token using exponential backoffs. It
   669  // returns the Vault token and whether the manager should exit.
   670  func (r *TaskRunner) deriveVaultToken() (token string, exit bool) {
   671  	attempts := 0
   672  	for {
   673  		tokens, err := r.vaultClient.DeriveToken(r.alloc, []string{r.task.Name})
   674  		if err == nil {
   675  			return tokens[r.task.Name], false
   676  		}
   677  
   678  		// Check if we can't recover from the error
   679  		if !structs.IsRecoverable(err) {
   680  			r.logger.Printf("[ERR] client: failed to derive Vault token for task %v on alloc %q: %v",
   681  				r.task.Name, r.alloc.ID, err)
   682  			r.Kill("vault", fmt.Sprintf("failed to derive token: %v", err), true)
   683  			return "", true
   684  		}
   685  
   686  		// Handle the retry case
   687  		backoff := (1 << (2 * uint64(attempts))) * vaultBackoffBaseline
   688  		if backoff > vaultBackoffLimit {
   689  			backoff = vaultBackoffLimit
   690  		}
   691  		r.logger.Printf("[ERR] client: failed to derive Vault token for task %v on alloc %q: %v; retrying in %v",
   692  			r.task.Name, r.alloc.ID, err, backoff)
   693  
   694  		attempts++
   695  
   696  		// Wait till retrying
   697  		select {
   698  		case <-r.waitCh:
   699  			return "", true
   700  		case <-time.After(backoff):
   701  		}
   702  	}
   703  }
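
// Worked example of the backoff used above (an illustrative helper, not part
// of the original code): the delay is vaultBackoffBaseline * 4^attempt, capped
// at vaultBackoffLimit, i.e. 5s, 20s, 80s, then 3m for every later attempt.
func exampleVaultBackoff(attempt uint64) time.Duration {
	backoff := (1 << (2 * attempt)) * vaultBackoffBaseline
	if backoff > vaultBackoffLimit {
		backoff = vaultBackoffLimit
	}
	return backoff
}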
   704  
   705  // writeToken writes the given token to disk
   706  func (r *TaskRunner) writeToken(token string) error {
   707  	tokenPath := filepath.Join(r.taskDir.SecretsDir, vaultTokenFile)
   708  	if err := ioutil.WriteFile(tokenPath, []byte(token), 0777); err != nil {
   709  		return fmt.Errorf("failed to save Vault tokens to secret dir for task %q in alloc %q: %v", r.task.Name, r.alloc.ID, err)
   710  	}
   711  
   712  	return nil
   713  }
   714  
   715  // updatedTokenHandler is called when a new Vault token is retrieved. Things
   716  // that rely on the token should be updated here.
   717  func (r *TaskRunner) updatedTokenHandler() {
   718  
   719  	// Update the tasks environment
   720  	if err := r.setTaskEnv(); err != nil {
   721  		r.setState(
   722  			structs.TaskStateDead,
   723  			structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
   724  		return
   725  	}
   726  
   727  	if r.templateManager != nil {
   728  		r.templateManager.Stop()
   729  
   730  		// Create a new templateManager
   731  		var err error
   732  		r.templateManager, err = NewTaskTemplateManager(r, r.task.Templates,
   733  			r.config, r.vaultFuture.Get(), r.taskDir.Dir, r.getTaskEnv())
   734  		if err != nil {
   735  			err := fmt.Errorf("failed to build task's template manager: %v", err)
   736  			r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
   737  			r.logger.Printf("[ERR] client: alloc %q, task %q %v", r.alloc.ID, r.task.Name, err)
   738  			r.Kill("vault", err.Error(), true)
   739  			return
   740  		}
   741  	}
   742  }
   743  
   744  // prestart handles life-cycle tasks that occur before the task has started.
   745  func (r *TaskRunner) prestart(resultCh chan bool) {
   746  	if r.task.Vault != nil {
   747  		// Wait for the token
   748  		r.logger.Printf("[DEBUG] client: waiting for Vault token for task %v in alloc %q", r.task.Name, r.alloc.ID)
   749  		tokenCh := r.vaultFuture.Wait()
   750  		select {
   751  		case <-tokenCh:
   752  		case <-r.waitCh:
   753  			resultCh <- false
   754  			return
   755  		}
   756  		r.logger.Printf("[DEBUG] client: retrieved Vault token for task %v in alloc %q", r.task.Name, r.alloc.ID)
   757  	}
   758  
   759  	if err := r.setTaskEnv(); err != nil {
   760  		r.setState(
   761  			structs.TaskStateDead,
   762  			structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
   763  		resultCh <- false
   764  		return
   765  	}
   766  
   767  	// If the job is a dispatch job and there is a payload, write it to disk
   768  	requirePayload := len(r.alloc.Job.Payload) != 0 &&
   769  		(r.task.DispatchPayload != nil && r.task.DispatchPayload.File != "")
   770  	if !r.payloadRendered && requirePayload {
   771  		renderTo := filepath.Join(r.taskDir.LocalDir, r.task.DispatchPayload.File)
   772  		decoded, err := snappy.Decode(nil, r.alloc.Job.Payload)
   773  		if err != nil {
   774  			r.setState(
   775  				structs.TaskStateDead,
   776  				structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
   777  			resultCh <- false
   778  			return
   779  		}
   780  
   781  		if err := os.MkdirAll(filepath.Dir(renderTo), 0777); err != nil {
   782  			r.setState(
   783  				structs.TaskStateDead,
   784  				structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
   785  			resultCh <- false
   786  			return
   787  		}
   788  
   789  		if err := ioutil.WriteFile(renderTo, decoded, 0777); err != nil {
   790  			r.setState(
   791  				structs.TaskStateDead,
   792  				structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
   793  			resultCh <- false
   794  			return
   795  		}
   796  
   797  		r.payloadRendered = true
   798  	}
   799  
   800  	for {
   801  		r.persistLock.Lock()
   802  		downloaded := r.artifactsDownloaded
   803  		r.persistLock.Unlock()
   804  
   805  		// Download the task's artifacts
   806  		if !downloaded && len(r.task.Artifacts) > 0 {
   807  			r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskDownloadingArtifacts))
   808  			for _, artifact := range r.task.Artifacts {
   809  				if err := getter.GetArtifact(r.getTaskEnv(), artifact, r.taskDir.Dir); err != nil {
   810  					wrapped := fmt.Errorf("failed to download artifact %q: %v", artifact.GetterSource, err)
   811  					r.logger.Printf("[DEBUG] client: %v", wrapped)
   812  					r.setState(structs.TaskStatePending,
   813  						structs.NewTaskEvent(structs.TaskArtifactDownloadFailed).SetDownloadError(wrapped))
   814  					r.restartTracker.SetStartError(structs.WrapRecoverable(wrapped.Error(), err))
   815  					goto RESTART
   816  				}
   817  			}
   818  
   819  			r.persistLock.Lock()
   820  			r.artifactsDownloaded = true
   821  			r.persistLock.Unlock()
   822  		}
   823  
   824  		// We don't have to wait for any template
   825  		if len(r.task.Templates) == 0 {
   826  			// Send the start signal
   827  			select {
   828  			case r.startCh <- struct{}{}:
   829  			default:
   830  			}
   831  
   832  			resultCh <- true
   833  			return
   834  		}
   835  
   836  		// Build the template manager
   837  		if r.templateManager == nil {
   838  			var err error
   839  			r.templateManager, err = NewTaskTemplateManager(r, r.task.Templates,
   840  				r.config, r.vaultFuture.Get(), r.taskDir.Dir, r.getTaskEnv())
   841  			if err != nil {
   842  				err := fmt.Errorf("failed to build task's template manager: %v", err)
   843  				r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
   844  				r.logger.Printf("[ERR] client: alloc %q, task %q %v", r.alloc.ID, r.task.Name, err)
   845  				resultCh <- false
   846  				return
   847  			}
   848  		}
   849  
   850  		// Block for consul-template
   851  		// TODO Hooks should register themselves as blocking and then we can
   852  		// periodically enumerate what we are still blocked on
   853  		select {
   854  		case <-r.unblockCh:
   855  			// Send the start signal
   856  			select {
   857  			case r.startCh <- struct{}{}:
   858  			default:
   859  			}
   860  
   861  			resultCh <- true
   862  			return
   863  		case <-r.waitCh:
   864  			// The run loop has exited so exit too
   865  			resultCh <- false
   866  			return
   867  		}
   868  
   869  	RESTART:
   870  		restart := r.shouldRestart()
   871  		if !restart {
   872  			resultCh <- false
   873  			return
   874  		}
   875  	}
   876  }
   877  
   878  // postrun is used to do any cleanup that is necessary after exiting the run loop
   879  func (r *TaskRunner) postrun() {
   880  
   881  	// Stop the template manager
   882  	if r.templateManager != nil {
   883  		r.templateManager.Stop()
   884  	}
   885  
   886  	// Unofficial Feature: Forcefully stop the associated container if still running
   887  	if err := r.taskRunnerPlus.stopContainer(); err != nil {
   888  		r.logger.Printf("[DEBUG] %s", err.Error())
   889  	}
   890  }
   891  
   892  // run is the main run loop that handles starting the application, destroying
   893  // it, restarts and signals.
   894  func (r *TaskRunner) run() {
   895  	// Predeclare things so we can jump to the RESTART
   896  	var stopCollection chan struct{}
   897  	var handleWaitCh chan *dstructs.WaitResult
   898  
   899  	// If we already have a handle, populate stopCollection and handleWaitCh
   900  	// so the invariant that they exist alongside a handle still holds.
   901  	r.handleLock.Lock()
   902  	handleEmpty := r.handle == nil
   903  	r.handleLock.Unlock()
   904  
   905  	if !handleEmpty {
   906  		stopCollection = make(chan struct{})
   907  		go r.collectResourceUsageStats(stopCollection)
   908  		handleWaitCh = r.handle.WaitCh()
   909  	}
   910  
   911  	for {
   912  		// Do the prestart activities
   913  		prestartResultCh := make(chan bool, 1)
   914  		go r.prestart(prestartResultCh)
   915  
   916  	WAIT:
   917  		for {
   918  			select {
   919  			case success := <-prestartResultCh:
   920  				if !success {
   921  					r.cleanup()
   922  					r.setState(structs.TaskStateDead, nil)
   923  					return
   924  				}
   925  			case <-r.startCh:
   926  				// Start the task if not yet started or it is being forced. This logic
   927  				// is necessary because in the case of a restore the handle already
   928  				// exists.
   929  				r.handleLock.Lock()
   930  				handleEmpty := r.handle == nil
   931  				r.handleLock.Unlock()
   932  				if handleEmpty {
   933  					startErr := r.startTask()
   934  					r.restartTracker.SetStartError(startErr)
   935  					if startErr != nil {
   936  						r.setState("", structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(startErr))
   937  						goto RESTART
   938  					}
   939  
   940  					// Mark the task as started
   941  					r.setState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted))
   942  					r.runningLock.Lock()
   943  					r.running = true
   944  					r.runningLock.Unlock()
   945  
   946  					if stopCollection == nil {
   947  						stopCollection = make(chan struct{})
   948  						go r.collectResourceUsageStats(stopCollection)
   949  					}
   950  
   951  					handleWaitCh = r.handle.WaitCh()
   952  				}
   953  
   954  			case waitRes := <-handleWaitCh:
   955  				if waitRes == nil {
   956  					panic("nil wait")
   957  				}
   958  
   959  				r.runningLock.Lock()
   960  				r.running = false
   961  				r.runningLock.Unlock()
   962  
   963  				// Stop collection of the task's resource usage
   964  				close(stopCollection)
   965  
   966  				// Log whether the task was successful or not.
   967  				r.restartTracker.SetWaitResult(waitRes)
   968  				r.setState("", r.waitErrorToEvent(waitRes))
   969  				if !waitRes.Successful() {
   970  					r.logger.Printf("[INFO] client: task %q for alloc %q failed: %v", r.task.Name, r.alloc.ID, waitRes)
   971  				} else {
   972  					r.logger.Printf("[INFO] client: task %q for alloc %q completed successfully", r.task.Name, r.alloc.ID)
   973  				}
   974  
   975  				break WAIT
   976  			case update := <-r.updateCh:
   977  				if err := r.handleUpdate(update); err != nil {
   978  					r.logger.Printf("[ERR] client: update to task %q failed: %v", r.task.Name, err)
   979  				}
   980  
   981  			case se := <-r.signalCh:
   982  				r.runningLock.Lock()
   983  				running := r.running
   984  				r.runningLock.Unlock()
   985  				common := fmt.Sprintf("signal %v to task %v for alloc %q", se.s, r.task.Name, r.alloc.ID)
   986  				if !running {
   987  					// Send no error
   988  					r.logger.Printf("[DEBUG] client: skipping %s", common)
   989  					se.result <- nil
   990  					continue
   991  				}
   992  
   993  				r.logger.Printf("[DEBUG] client: sending %s", common)
   994  				r.setState(structs.TaskStateRunning, se.e)
   995  
   996  				res := r.handle.Signal(se.s)
   997  				se.result <- res
   998  
   999  			case event := <-r.restartCh:
  1000  				r.runningLock.Lock()
  1001  				running := r.running
  1002  				r.runningLock.Unlock()
  1003  				common := fmt.Sprintf("task %v for alloc %q", r.task.Name, r.alloc.ID)
  1004  				if !running {
  1005  					r.logger.Printf("[DEBUG] client: skipping restart of %v: task isn't running", common)
  1006  					continue
  1007  				}
  1008  
  1009  				r.logger.Printf("[DEBUG] client: restarting %s: %v", common, event.RestartReason)
  1010  				r.setState(structs.TaskStateRunning, event)
  1011  				r.killTask(nil)
  1012  
  1013  				close(stopCollection)
  1014  
  1015  				if handleWaitCh != nil {
  1016  					<-handleWaitCh
  1017  				}
  1018  
  1019  				// Since the restart isn't from a failure, restart immediately
  1020  				// and don't count against the restart policy
  1021  				r.restartTracker.SetRestartTriggered()
  1022  				break WAIT
  1023  
  1024  			case <-r.destroyCh:
  1025  
  1026  				r.runningLock.Lock()
  1027  				running := r.running
  1028  				r.runningLock.Unlock()
  1029  				if !running {
  1030  					r.cleanup()
  1031  					r.setState(structs.TaskStateDead, r.destroyEvent)
  1032  					return
  1033  				}
  1034  
  1035  				// Store the task event that provides context on the task
  1036  				// destroy. The Killed event is set from the alloc_runner and
  1037  				// doesn't add detail
  1038  				var killEvent *structs.TaskEvent
  1039  				if r.destroyEvent.Type != structs.TaskKilled {
  1040  					if r.destroyEvent.Type == structs.TaskKilling {
  1041  						killEvent = r.destroyEvent
  1042  					} else {
  1043  						r.setState(structs.TaskStateRunning, r.destroyEvent)
  1044  					}
  1045  				}
  1046  
  1047  				r.killTask(killEvent)
  1048  				close(stopCollection)
  1049  
  1050  				// Wait for handler to exit before calling cleanup
  1051  				<-handleWaitCh
  1052  				r.cleanup()
  1053  
  1054  				r.setState(structs.TaskStateDead, nil)
  1055  				return
  1056  			}
  1057  		}
  1058  
  1059  	RESTART:
  1060  		restart := r.shouldRestart()
  1061  		if !restart {
  1062  			r.cleanup()
  1063  			r.setState(structs.TaskStateDead, nil)
  1064  			return
  1065  		}
  1066  
  1067  		// Clear the handle so a new driver will be created.
  1068  		r.handleLock.Lock()
  1069  		r.handle = nil
  1070  		handleWaitCh = nil
  1071  		stopCollection = nil
  1072  		r.handleLock.Unlock()
  1073  	}
  1074  }
  1075  
  1076  // cleanup calls Driver.Cleanup when a task is stopping. Errors are logged.
  1077  func (r *TaskRunner) cleanup() {
  1078  
  1079  	drv, err := r.createDriver()
  1080  	if err != nil {
  1081  		r.logger.Printf("[ERR] client: error creating driver to cleanup resources: %v", err)
  1082  		return
  1083  	}
  1084  
  1085  	res := r.getCreatedResources()
  1086  
  1087  	ctx := driver.NewExecContext(r.taskDir)
  1088  	attempts := 1
  1089  	var cleanupErr error
  1090  	for retry := true; retry; attempts++ {
  1091  		cleanupErr = drv.Cleanup(ctx, res)
  1092  		retry = structs.IsRecoverable(cleanupErr)
  1093  
  1094  		// Copy current createdResources state in case SaveState is
  1095  		// called between retries
  1096  		r.setCreatedResources(res)
  1097  
  1098  		// Retry 3 times with sleeps between
  1099  		if !retry || attempts > 3 {
  1100  			break
  1101  		}
  1102  		time.Sleep(time.Duration(attempts) * time.Second)
  1103  	}
  1104  
  1105  	if cleanupErr != nil {
  1106  		r.logger.Printf("[ERR] client: error cleaning up resources for task %q after %d attempts: %v", r.task.Name, attempts, cleanupErr)
  1107  	}
  1108  
  1109  	// Unofficial Feature: Forcefully stop the associated container
  1110  	if err := r.taskRunnerPlus.stopContainer(); err != nil {
  1111  		r.logger.Printf("[DEBUG] %s", err.Error())
  1112  	}
  1113  
  1114  	return
  1115  }
  1116  
  1117  // shouldRestart returns whether the task should restart. If the return value is
  1118  // true, the task's restart policy has already been considered and any wait time
  1119  // between restarts has been applied.
  1120  func (r *TaskRunner) shouldRestart() bool {
  1121  	state, when := r.restartTracker.GetState()
  1122  	reason := r.restartTracker.GetReason()
  1123  	switch state {
  1124  	case structs.TaskNotRestarting, structs.TaskTerminated:
  1125  		r.logger.Printf("[INFO] client: Not restarting task: %v for alloc: %v ", r.task.Name, r.alloc.ID)
  1126  		if state == structs.TaskNotRestarting {
  1127  			r.setState(structs.TaskStateDead,
  1128  				structs.NewTaskEvent(structs.TaskNotRestarting).
  1129  					SetRestartReason(reason).SetFailsTask())
  1130  		}
  1131  		return false
  1132  	case structs.TaskRestarting:
  1133  
  1134  		// Unofficial Feature: Forcefully stop the associated container
  1135  		if err := r.taskRunnerPlus.stopContainer(); err != nil {
  1136  			r.logger.Printf("[DEBUG] %s", err.Error())
  1137  		}
  1138  
  1139  		r.logger.Printf("[INFO] client: Restarting task %q for alloc %q in %v", r.task.Name, r.alloc.ID, when)
  1140  		r.setState(structs.TaskStatePending,
  1141  			structs.NewTaskEvent(structs.TaskRestarting).
  1142  				SetRestartDelay(when).
  1143  				SetRestartReason(reason))
  1144  	default:
  1145  		r.logger.Printf("[ERR] client: restart tracker returned unknown state: %q", state)
  1146  		return false
  1147  	}
  1148  
  1149  	// Sleep but watch for destroy events.
  1150  	select {
  1151  	case <-time.After(when):
  1152  	case <-r.destroyCh:
  1153  	}
  1154  
  1155  	// Destroyed while we were waiting to restart, so abort.
  1156  	r.destroyLock.Lock()
  1157  	destroyed := r.destroy
  1158  	r.destroyLock.Unlock()
  1159  	if destroyed {
  1160  		r.logger.Printf("[DEBUG] client: Not restarting task: %v because it has been destroyed", r.task.Name)
  1161  		r.setState(structs.TaskStateDead, r.destroyEvent)
  1162  		return false
  1163  	}
  1164  
  1165  	return true
  1166  }
  1167  
  1168  // killTask kills the running task. A killing event can optionally be passed and
  1169  // this event is used to mark the task as being killed. It provides a means to
  1170  // store extra information.
  1171  func (r *TaskRunner) killTask(killingEvent *structs.TaskEvent) {
  1172  	r.runningLock.Lock()
  1173  	running := r.running
  1174  	r.runningLock.Unlock()
  1175  	if !running {
  1176  		return
  1177  	}
  1178  
  1179  	// Get the kill timeout
  1180  	timeout := driver.GetKillTimeout(r.task.KillTimeout, r.config.MaxKillTimeout)
  1181  
  1182  	// Build the event
  1183  	var event *structs.TaskEvent
  1184  	if killingEvent != nil {
  1185  		event = killingEvent
  1186  		event.Type = structs.TaskKilling
  1187  	} else {
  1188  		event = structs.NewTaskEvent(structs.TaskKilling)
  1189  	}
  1190  	event.SetKillTimeout(timeout)
  1191  
  1192  	// Mark that we received the kill event
  1193  	r.setState(structs.TaskStateRunning, event)
  1194  
  1195  	// Kill the task using an exponential backoff in-case of failures.
  1196  	destroySuccess, err := r.handleDestroy()
  1197  	if !destroySuccess {
  1198  		// We couldn't successfully destroy the resource created.
  1199  		r.logger.Printf("[ERR] client: failed to kill task %q. Resources may have been leaked: %v", r.task.Name, err)
  1200  	}
  1201  
  1202  	r.runningLock.Lock()
  1203  	r.running = false
  1204  	r.runningLock.Unlock()
  1205  
  1206  	// Unofficial Feature: Send a gRPC signal to the container
  1207  	r.taskRunnerPlus.SendGRPCSignal(5 * time.Second)
  1208  
  1209  	// Unofficial Feature: Forcefully stop the associated container if still running
  1210  	if err := r.taskRunnerPlus.stopContainer(); err != nil {
  1211  		r.logger.Printf("[DEBUG] %s", err.Error())
  1212  	}
  1213  
  1214  	// Store that the task has been destroyed and any associated error.
  1215  	r.setState("", structs.NewTaskEvent(structs.TaskKilled).SetKillError(err))
  1216  }
  1217  
  1218  // startTask creates the driver, task dir, and starts the task.
  1219  func (r *TaskRunner) startTask() error {
  1220  
  1221  	// Create a driver
  1222  	drv, err := r.createDriver()
  1223  	if err != nil {
  1224  		return fmt.Errorf("failed to create driver of task %q for alloc %q: %v",
  1225  			r.task.Name, r.alloc.ID, err)
  1226  	}
  1227  
  1228  	// Since the raw_exec driver has no resource allocation support,
  1229  	// we introduce a memory availability check before the task is started.
  1230  	// It fires a SetupFailure event if the memory requirement is not met.
  1231  	if r.task.Driver == "raw_exec" {
  1232  		expectedMemStr := r.getTaskEnv().Env[r.taskRunnerPlus.MemoryAllocEnvKey]
  1233  		if len(expectedMemStr) > 0 {
  1234  			expectedMem, _ := strconv.Atoi(expectedMemStr)
  1235  			err := r.taskRunnerPlus.KillOnLowMemory(expectedMem, func() error {
  1236  				wrapped := fmt.Errorf("insufficient memory")
  1237  				r.killTask(structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(wrapped))
  1238  				r.logger.Printf("[DEBUG] client: insufficient memory for raw_exec task. Task will be killed")
  1239  				return wrapped
  1240  			})
  1241  			if err != nil {
  1242  				return err
  1243  			}
  1244  		}
  1245  	}
  1246  
  1247  	// Run prestart
  1248  	ctx := driver.NewExecContext(r.taskDir)
  1249  	res, err := drv.Prestart(ctx, r.task)
  1250  
  1251  	// Merge newly created resources into previously created resources
  1252  	r.createdResourcesLock.Lock()
  1253  	r.createdResources.Merge(res)
  1254  	r.createdResourcesLock.Unlock()
  1255  
  1256  	if err != nil {
  1257  		wrapped := fmt.Sprintf("failed to initialize task %q for alloc %q: %v",
  1258  			r.task.Name, r.alloc.ID, err)
  1259  		r.logger.Printf("[WARN] client: error from prestart: %s", wrapped)
  1260  		return structs.WrapRecoverable(wrapped, err)
  1261  	}
  1262  
  1263  	// Start the job
  1264  	handle, err := drv.Start(ctx, r.task)
  1265  	if err != nil {
  1266  		wrapped := fmt.Sprintf("failed to start task %q for alloc %q: %v",
  1267  			r.task.Name, r.alloc.ID, err)
  1268  		r.logger.Printf("[WARN] client: %s", wrapped)
  1269  		return structs.WrapRecoverable(wrapped, err)
  1270  
  1271  	}
  1272  
  1273  	r.handleLock.Lock()
  1274  	r.handle = handle
  1275  	r.handleLock.Unlock()
  1276  	return nil
  1277  }
  1278  
  1279  // buildTaskDir creates the task directory before driver.Prestart. It is safe
  1280  // to call multiple times as its state is persisted.
  1281  func (r *TaskRunner) buildTaskDir(fsi cstructs.FSIsolation) error {
  1282  	r.persistLock.Lock()
  1283  	built := r.taskDirBuilt
  1284  	r.persistLock.Unlock()
  1285  
  1286  	// We do not set the state again since this only occurs during restoration
  1287  	// and the task dir is already built. The reason we call Build again is to
  1288  	// ensure that the task dir invariants are still held.
  1289  	if !built {
  1290  		r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskSetup).
  1291  			SetMessage(structs.TaskBuildingTaskDir))
  1292  	}
  1293  
  1294  	chroot := config.DefaultChrootEnv
  1295  	if len(r.config.ChrootEnv) > 0 {
  1296  		chroot = r.config.ChrootEnv
  1297  	}
  1298  	if err := r.taskDir.Build(built, chroot, fsi); err != nil {
  1299  		return err
  1300  	}
  1301  
  1302  	// Mark task dir as successfully built
  1303  	r.persistLock.Lock()
  1304  	r.taskDirBuilt = true
  1305  	r.persistLock.Unlock()
  1306  	return nil
  1307  }
  1308  
  1309  // collectResourceUsageStats starts collecting resource usage stats of a Task.
  1310  // Collection ends when the passed channel is closed
  1311  func (r *TaskRunner) collectResourceUsageStats(stopCollection <-chan struct{}) {
  1312  	// start collecting the stats right away and then start collecting every
  1313  	// collection interval
  1314  	next := time.NewTimer(0)
  1315  	defer next.Stop()
  1316  	for {
  1317  		select {
  1318  		case <-next.C:
  1319  			next.Reset(r.config.StatsCollectionInterval)
  1320  			if r.handle == nil {
  1321  				continue
  1322  			}
  1323  			ru, err := r.handle.Stats()
  1324  
  1325  			if err != nil {
  1326  				// Check if the driver doesn't implement stats
  1327  				if err.Error() == driver.DriverStatsNotImplemented.Error() {
  1328  					r.logger.Printf("[DEBUG] client: driver for task %q in allocation %q doesn't support stats", r.task.Name, r.alloc.ID)
  1329  					return
  1330  				}
  1331  
  1332  				// We do not log when the plugin is shut down as this is simply a
  1333  				// race between the stopCollection channel being closed and calling
  1334  				// Stats on the handle.
  1335  				if !strings.Contains(err.Error(), "connection is shut down") {
  1336  					r.logger.Printf("[WARN] client: error fetching stats of task %v: %v", r.task.Name, err)
  1337  				}
  1338  				continue
  1339  			}
  1340  
  1341  			r.resourceUsageLock.Lock()
  1342  			r.resourceUsage = ru
  1343  			r.resourceUsageLock.Unlock()
  1344  			if ru != nil {
  1345  				r.emitStats(ru)
  1346  			}
  1347  		case <-stopCollection:
  1348  			return
  1349  		}
  1350  	}
  1351  }
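
// Minimal sketch of the timer pattern used above (illustration only; the
// function and its parameters are hypothetical): a zero-duration timer makes
// the first collection fire immediately, and every tick re-arms the timer for
// the next interval until the stop channel is closed.
func exampleCollectLoop(interval time.Duration, stop <-chan struct{}, collect func()) {
	next := time.NewTimer(0)
	defer next.Stop()
	for {
		select {
		case <-next.C:
			next.Reset(interval)
			collect()
		case <-stop:
			return
		}
	}
}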
  1352  
  1353  // LatestResourceUsage returns the last resource utilization datapoint collected
  1354  func (r *TaskRunner) LatestResourceUsage() *cstructs.TaskResourceUsage {
  1355  	r.resourceUsageLock.RLock()
  1356  	defer r.resourceUsageLock.RUnlock()
  1357  	r.runningLock.Lock()
  1358  	defer r.runningLock.Unlock()
  1359  
  1360  	// If the task is not running there can be no latest resource usage
  1361  	if !r.running {
  1362  		return nil
  1363  	}
  1364  
  1365  	return r.resourceUsage
  1366  }
  1367  
  1368  // handleUpdate takes an updated allocation and updates internal state to
  1369  // reflect the new config for the task.
  1370  func (r *TaskRunner) handleUpdate(update *structs.Allocation) error {
  1371  	// Extract the task group from the alloc.
  1372  	tg := update.Job.LookupTaskGroup(update.TaskGroup)
  1373  	if tg == nil {
  1374  		return fmt.Errorf("alloc '%s' missing task group '%s'", update.ID, update.TaskGroup)
  1375  	}
  1376  
  1377  	// Extract the task.
  1378  	var updatedTask *structs.Task
  1379  	for _, t := range tg.Tasks {
  1380  		if t.Name == r.task.Name {
  1381  			updatedTask = t.Copy()
  1382  		}
  1383  	}
  1384  	if updatedTask == nil {
  1385  		return fmt.Errorf("task group %q doesn't contain task %q", tg.Name, r.task.Name)
  1386  	}
  1387  
  1388  	// Merge in the task resources
  1389  	updatedTask.Resources = update.TaskResources[updatedTask.Name]
  1390  
  1391  	// Update will update resources and store the new kill timeout.
  1392  	var mErr multierror.Error
  1393  	r.handleLock.Lock()
  1394  	if r.handle != nil {
  1395  		if err := r.handle.Update(updatedTask); err != nil {
  1396  			mErr.Errors = append(mErr.Errors, fmt.Errorf("updating task resources failed: %v", err))
  1397  		}
  1398  	}
  1399  	r.handleLock.Unlock()
  1400  
  1401  	// Update the restart policy.
  1402  	if r.restartTracker != nil {
  1403  		r.restartTracker.SetPolicy(tg.RestartPolicy)
  1404  	}
  1405  
  1406  	// Store the updated alloc.
  1407  	r.alloc = update
  1408  	r.task = updatedTask
  1409  	return mErr.ErrorOrNil()
  1410  }
  1411  
  1412  // handleDestroy kills the task handle. In the case that killing fails,
  1413  // handleDestroy will retry with an exponential backoff and will give up at a
  1414  // given limit. It returns whether the task was destroyed and the error
  1415  // associated with the last kill attempt.
  1416  func (r *TaskRunner) handleDestroy() (destroyed bool, err error) {
  1417  	// Cap the number of times we attempt to kill the task.
  1418  	for i := 0; i < killFailureLimit; i++ {
  1419  		if err = r.handle.Kill(); err != nil {
  1420  			// Calculate the new backoff
  1421  			backoff := (1 << (2 * uint64(i))) * killBackoffBaseline
  1422  			if backoff > killBackoffLimit {
  1423  				backoff = killBackoffLimit
  1424  			}
  1425  
  1426  			r.logger.Printf("[ERR] client: failed to kill task '%s' for alloc %q. Retrying in %v: %v",
  1427  				r.task.Name, r.alloc.ID, backoff, err)
  1428  			time.Sleep(time.Duration(backoff))
  1429  		} else {
  1430  			// Kill was successful
  1431  			return true, nil
  1432  		}
  1433  	}
  1434  	return
  1435  }
  1436  
  1437  // Restart will restart the task
  1438  func (r *TaskRunner) Restart(source, reason string) {
  1439  	reasonStr := fmt.Sprintf("%s: %s", source, reason)
  1440  	event := structs.NewTaskEvent(structs.TaskRestartSignal).SetRestartReason(reasonStr)
  1441  
  1442  	select {
  1443  	case r.restartCh <- event:
  1444  	case <-r.waitCh:
  1445  	}
  1446  }
  1447  
  1448  // Signal will send a signal to the task
  1449  func (r *TaskRunner) Signal(source, reason string, s os.Signal) error {
  1450  
  1451  	reasonStr := fmt.Sprintf("%s: %s", source, reason)
  1452  	event := structs.NewTaskEvent(structs.TaskSignaling).SetTaskSignal(s).SetTaskSignalReason(reasonStr)
  1453  
  1454  	resCh := make(chan error)
  1455  	se := SignalEvent{
  1456  		s:      s,
  1457  		e:      event,
  1458  		result: resCh,
  1459  	}
  1460  
  1461  	select {
  1462  	case r.signalCh <- se:
  1463  	case <-r.waitCh:
  1464  	}
  1465  
  1466  	return <-resCh
  1467  }
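
// Illustrative usage sketch (not part of the original code): asking the runner
// to deliver a signal to the task, e.g. after a template has been re-rendered.
// The source and reason strings are arbitrary labels chosen by the caller.
func exampleSignalUsage(r *TaskRunner) error {
	return r.Signal("consul-template", "template re-rendered", os.Interrupt)
}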
  1468  
  1469  // Kill will kill a task and store the error, no longer restarting the task. If
  1470  // fail is set, the task is marked as having failed.
  1471  func (r *TaskRunner) Kill(source, reason string, fail bool) {
  1472  	reasonStr := fmt.Sprintf("%s: %s", source, reason)
  1473  	event := structs.NewTaskEvent(structs.TaskKilling).SetKillReason(reasonStr)
  1474  	if fail {
  1475  		event.SetFailsTask()
  1476  	}
  1477  
  1478  	r.logger.Printf("[DEBUG] client: killing task %v for alloc %q: %v", r.task.Name, r.alloc.ID, reasonStr)
  1479  	r.Destroy(event)
  1480  }
  1481  
  1482  // UnblockStart unblocks the starting of the task. It currently assumes only
  1483  // consul-template will unblock
  1484  func (r *TaskRunner) UnblockStart(source string) {
  1485  	r.unblockLock.Lock()
  1486  	defer r.unblockLock.Unlock()
  1487  	if r.unblocked {
  1488  		return
  1489  	}
  1490  
  1491  	r.logger.Printf("[DEBUG] client: unblocking task %v for alloc %q: %v", r.task.Name, r.alloc.ID, source)
  1492  	r.unblocked = true
  1493  	close(r.unblockCh)
  1494  }
  1495  
  1496  // Helper function for converting a WaitResult into a TaskTerminated event.
  1497  func (r *TaskRunner) waitErrorToEvent(res *dstructs.WaitResult) *structs.TaskEvent {
  1498  	return structs.NewTaskEvent(structs.TaskTerminated).
  1499  		SetExitCode(res.ExitCode).
  1500  		SetSignal(res.Signal).
  1501  		SetExitMessage(res.Err)
  1502  }
  1503  
  1504  // Update is used to update the task of the context
  1505  func (r *TaskRunner) Update(update *structs.Allocation) {
  1506  	select {
  1507  	case r.updateCh <- update:
  1508  	default:
  1509  		r.logger.Printf("[ERR] client: dropping task update '%s' (alloc '%s')",
  1510  			r.task.Name, r.alloc.ID)
  1511  	}
  1512  }
  1513  
  1514  // Destroy is used to indicate that the task context should be destroyed. The
  1515  // event parameter provides a context for the destroy.
  1516  func (r *TaskRunner) Destroy(event *structs.TaskEvent) {
  1517  	r.destroyLock.Lock()
  1518  	defer r.destroyLock.Unlock()
  1519  
  1520  	if r.destroy {
  1521  		return
  1522  	}
  1523  	r.destroy = true
  1524  	r.destroyEvent = event
  1525  	close(r.destroyCh)
  1526  }
  1527  
  1528  // getCreatedResources returns the resources created by drivers. It will never
  1529  // return nil.
  1530  func (r *TaskRunner) getCreatedResources() *driver.CreatedResources {
  1531  	r.createdResourcesLock.Lock()
  1532  	if r.createdResources == nil {
  1533  		r.createdResources = driver.NewCreatedResources()
  1534  	}
  1535  	cr := r.createdResources.Copy()
  1536  	r.createdResourcesLock.Unlock()
  1537  
  1538  	return cr
  1539  }
  1540  
  1541  // setCreatedResources updates the resources created by drivers. If passed nil
  1542  // it will set createdResources to an initialized struct.
  1543  func (r *TaskRunner) setCreatedResources(cr *driver.CreatedResources) {
  1544  	if cr == nil {
  1545  		cr = driver.NewCreatedResources()
  1546  	}
  1547  	r.createdResourcesLock.Lock()
  1548  	r.createdResources = cr.Copy()
  1549  	r.createdResourcesLock.Unlock()
  1550  }
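
// Minimal usage sketch (illustration only; the function is hypothetical):
// because both helpers above copy under createdResourcesLock, callers never
// share the underlying CreatedResources value with the runner.
func exampleCreatedResourcesCopy(r *TaskRunner, res *driver.CreatedResources) *driver.CreatedResources {
	r.setCreatedResources(res)     // the runner stores its own copy of res
	return r.getCreatedResources() // and hands back another copy, never its stored value
}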
  1551  
  1552  // emitStats emits resource usage stats of tasks to remote metrics collector
  1553  // sinks
  1554  func (r *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) {
  1555  	if ru.ResourceUsage.MemoryStats != nil && r.config.PublishAllocationMetrics {
  1556  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "rss"}, float32(ru.ResourceUsage.MemoryStats.RSS))
  1557  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "cache"}, float32(ru.ResourceUsage.MemoryStats.Cache))
  1558  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "swap"}, float32(ru.ResourceUsage.MemoryStats.Swap))
  1559  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "max_usage"}, float32(ru.ResourceUsage.MemoryStats.MaxUsage))
  1560  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "kernel_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelUsage))
  1561  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "kernel_max_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage))
  1562  	}
  1563  
  1564  	if ru.ResourceUsage.CpuStats != nil && r.config.PublishAllocationMetrics {
  1565  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_percent"}, float32(ru.ResourceUsage.CpuStats.Percent))
  1566  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "system"}, float32(ru.ResourceUsage.CpuStats.SystemMode))
  1567  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "user"}, float32(ru.ResourceUsage.CpuStats.UserMode))
  1568  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "throttled_time"}, float32(ru.ResourceUsage.CpuStats.ThrottledTime))
  1569  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "throttled_periods"}, float32(ru.ResourceUsage.CpuStats.ThrottledPeriods))
  1570  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_ticks"}, float32(ru.ResourceUsage.CpuStats.TotalTicks))
  1571  	}
  1572  }
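
// For illustration (an assumption about how a go-metrics sink joins the key
// parts above, not taken from this file): the gauges emitted by emitStats end
// up keyed roughly as
//
//	client.allocs.<job>.<task group>.<alloc id>.<task>.memory.rss
//	client.allocs.<job>.<task group>.<alloc id>.<task>.cpu.total_percent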