github.com/taylorchu/nomad@v0.5.3-rc1.0.20170407200202-db11e7dd7b55/client/task_runner.go

     1  package client
     2  
     3  import (
     4  	"crypto/md5"
     5  	"encoding/hex"
     6  	"fmt"
     7  	"io/ioutil"
     8  	"log"
     9  	"os"
    10  	"path/filepath"
    11  	"strings"
    12  	"sync"
    13  	"time"
    14  
    15  	"github.com/armon/go-metrics"
    16  	"github.com/golang/snappy"
    17  	"github.com/hashicorp/consul-template/signals"
    18  	"github.com/hashicorp/go-multierror"
    19  	"github.com/hashicorp/nomad/client/allocdir"
    20  	"github.com/hashicorp/nomad/client/config"
    21  	"github.com/hashicorp/nomad/client/driver"
    22  	"github.com/hashicorp/nomad/client/getter"
    23  	"github.com/hashicorp/nomad/client/vaultclient"
    24  	"github.com/hashicorp/nomad/nomad/structs"
    25  
    26  	"github.com/hashicorp/nomad/client/driver/env"
    27  	dstructs "github.com/hashicorp/nomad/client/driver/structs"
    28  	cstructs "github.com/hashicorp/nomad/client/structs"
    29  )
    30  
    31  const (
    32  	// killBackoffBaseline is the baseline time for exponential backoff while
    33  	// killing a task.
    34  	killBackoffBaseline = 5 * time.Second
    35  
    36  	// killBackoffLimit is the limit of the exponential backoff for killing
    37  	// the task.
    38  	killBackoffLimit = 2 * time.Minute
    39  
    40  	// killFailureLimit is how many times we will attempt to kill a task before
    41  	// giving up and potentially leaking resources.
    42  	killFailureLimit = 5
    43  
    44  	// vaultBackoffBaseline is the baseline time for exponential backoff when
    45  	// attempting to retrieve a Vault token
    46  	vaultBackoffBaseline = 5 * time.Second
    47  
    48  	// vaultBackoffLimit is the limit of the exponential backoff when attempting
    49  	// to retrieve a Vault token
    50  	vaultBackoffLimit = 3 * time.Minute
    51  
    52  	// vaultTokenFile is the name of the file holding the Vault token inside the
    53  	// task's secret directory
    54  	vaultTokenFile = "vault_token"
    55  )
    56  
    57  // TaskRunner is used to wrap a task within an allocation and provide the execution context.
    58  type TaskRunner struct {
    59  	config         *config.Config
    60  	updater        TaskStateUpdater
    61  	logger         *log.Logger
    62  	alloc          *structs.Allocation
    63  	restartTracker *RestartTracker
    64  
    65  	// running marks whether the task is running
    66  	running     bool
    67  	runningLock sync.Mutex
    68  
    69  	resourceUsage     *cstructs.TaskResourceUsage
    70  	resourceUsageLock sync.RWMutex
    71  
    72  	task    *structs.Task
    73  	taskDir *allocdir.TaskDir
    74  
    75  	// taskEnv is the environment variables of the task
    76  	taskEnv     *env.TaskEnvironment
    77  	taskEnvLock sync.Mutex
    78  
    79  	// updateCh is used to receive updated versions of the allocation
    80  	updateCh chan *structs.Allocation
    81  
    82  	handle     driver.DriverHandle
    83  	handleLock sync.Mutex
    84  
    85  	// artifactsDownloaded tracks whether the task's artifacts have been
    86  	// downloaded
    87  	//
    88  	// Must acquire persistLock when accessing
    89  	artifactsDownloaded bool
    90  
    91  	// taskDirBuilt tracks whether the task has built its directory.
    92  	//
    93  	// Must acquire persistLock when accessing
    94  	taskDirBuilt bool
    95  
    96  	// createdResources are all the resources created by the task driver
    97  	// across all attempts to start the task.
    98  	// Simple gets and sets should use {get,set}CreatedResources
    99  	createdResources     *driver.CreatedResources
   100  	createdResourcesLock sync.Mutex
   101  
   102  	// payloadRendered tracks whether the payload has been rendered to disk
   103  	payloadRendered bool
   104  
   105  	// vaultFuture is the means to wait for and get a Vault token
   106  	vaultFuture *tokenFuture
   107  
   108  	// recoveredVaultToken is the token that was recovered through a restore
   109  	recoveredVaultToken string
   110  
   111  	// vaultClient is used to retrieve and renew any needed Vault token
   112  	vaultClient vaultclient.VaultClient
   113  
   114  	// templateManager is used to manage any consul-templates this task may have
   115  	templateManager *TaskTemplateManager
   116  
   117  	// startCh is used to trigger the start of the task
   118  	startCh chan struct{}
   119  
   120  	// unblockCh is used to unblock the starting of the task
   121  	unblockCh   chan struct{}
   122  	unblocked   bool
   123  	unblockLock sync.Mutex
   124  
   125  	// restartCh is used to restart a task
   126  	restartCh chan *structs.TaskEvent
   127  
   128  	// signalCh is used to send a signal to a task
   129  	signalCh chan SignalEvent
   130  
   131  	destroy      bool
   132  	destroyCh    chan struct{}
   133  	destroyLock  sync.Mutex
   134  	destroyEvent *structs.TaskEvent
   135  
   136  	// waitCh closing marks the run loop as having exited
   137  	waitCh chan struct{}
   138  
   139  	// persistLock must be acquired when accessing fields stored by
   140  	// SaveState. SaveState is called asynchronously to TaskRunner.Run by
   141  	// AllocRunner, so all state fields must be synchronized using this
   142  	// lock.
   143  	persistLock sync.Mutex
   144  }
   145  
   146  // taskRunnerState is used to snapshot the state of the task runner
   147  type taskRunnerState struct {
   148  	Version            string
   149  	Task               *structs.Task
   150  	HandleID           string
   151  	ArtifactDownloaded bool
   152  	TaskDirBuilt       bool
   153  	CreatedResources   *driver.CreatedResources
   154  	PayloadRendered    bool
   155  }
   156  
   157  // TaskStateUpdater is used to signal that a task's state has changed.
   158  type TaskStateUpdater func(taskName, state string, event *structs.TaskEvent)
   159  
   160  // SignalEvent is a tuple of the signal and the event generating it
   161  type SignalEvent struct {
   162  	// s is the signal to be sent
   163  	s os.Signal
   164  
   165  	// e is the task event generating the signal
   166  	e *structs.TaskEvent
   167  
   168  	// result should be used to send back the result of the signal
   169  	result chan<- error
   170  }
   171  
   172  // NewTaskRunner is used to create a new task context
   173  func NewTaskRunner(logger *log.Logger, config *config.Config,
   174  	updater TaskStateUpdater, taskDir *allocdir.TaskDir,
   175  	alloc *structs.Allocation, task *structs.Task,
   176  	vaultClient vaultclient.VaultClient) *TaskRunner {
   177  
   178  	// Merge in the task resources
   179  	task.Resources = alloc.TaskResources[task.Name]
   180  
   181  	// Build the restart tracker.
   182  	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
   183  	if tg == nil {
   184  		logger.Printf("[ERR] client: alloc '%s' references missing task group '%s'", alloc.ID, alloc.TaskGroup)
   185  		return nil
   186  	}
   187  	restartTracker := newRestartTracker(tg.RestartPolicy, alloc.Job.Type)
   188  
   189  	tc := &TaskRunner{
   190  		config:           config,
   191  		updater:          updater,
   192  		logger:           logger,
   193  		restartTracker:   restartTracker,
   194  		alloc:            alloc,
   195  		task:             task,
   196  		taskDir:          taskDir,
   197  		createdResources: driver.NewCreatedResources(),
   198  		vaultClient:      vaultClient,
   199  		vaultFuture:      NewTokenFuture().Set(""),
   200  		updateCh:         make(chan *structs.Allocation, 64),
   201  		destroyCh:        make(chan struct{}),
   202  		waitCh:           make(chan struct{}),
   203  		startCh:          make(chan struct{}, 1),
   204  		unblockCh:        make(chan struct{}),
   205  		restartCh:        make(chan *structs.TaskEvent),
   206  		signalCh:         make(chan SignalEvent),
   207  	}
   208  
   209  	return tc
   210  }
   211  
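        // Typical lifecycle, as driven by the alloc runner (a sketch, not an exact sequence):
        //
        //	tr := NewTaskRunner(logger, cfg, updater, taskDir, alloc, task, vaultClient)
        //	tr.MarkReceived()
        //	go tr.Run()
        //	// ... Update(alloc) on changes, Destroy(event) to stop ...
        //	<-tr.WaitCh()
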
   212  // MarkReceived marks the task as received.
   213  func (r *TaskRunner) MarkReceived() {
   214  	r.updater(r.task.Name, structs.TaskStatePending, structs.NewTaskEvent(structs.TaskReceived))
   215  }
   216  
   217  // WaitCh returns a channel to wait for termination
   218  func (r *TaskRunner) WaitCh() <-chan struct{} {
   219  	return r.waitCh
   220  }
   221  
   222  // stateFilePath returns the path to our state file
   223  func (r *TaskRunner) stateFilePath() string {
   224  	// Get the MD5 of the task name
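        	// (the hash only provides a stable, filesystem-safe directory name; it is not for security)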
   225  	hashVal := md5.Sum([]byte(r.task.Name))
   226  	hashHex := hex.EncodeToString(hashVal[:])
   227  	dirName := fmt.Sprintf("task-%s", hashHex)
   228  
   229  	// Generate the path
   230  	path := filepath.Join(r.config.StateDir, "alloc", r.alloc.ID,
   231  		dirName, "state.json")
   232  	return path
   233  }
   234  
   235  // RestoreState is used to restore our state
   236  func (r *TaskRunner) RestoreState() error {
   237  	// Load the snapshot
   238  	var snap taskRunnerState
   239  	if err := restoreState(r.stateFilePath(), &snap); err != nil {
   240  		return err
   241  	}
   242  
   243  	// Restore fields
   244  	if snap.Task == nil {
   245  		return fmt.Errorf("task runner snapshot includes nil Task")
   246  	}
   247  	r.task = snap.Task
   248  
   249  	r.artifactsDownloaded = snap.ArtifactDownloaded
   250  	r.taskDirBuilt = snap.TaskDirBuilt
   251  	r.payloadRendered = snap.PayloadRendered
   252  
   253  	r.setCreatedResources(snap.CreatedResources)
   254  
   255  	if err := r.setTaskEnv(); err != nil {
   256  		return fmt.Errorf("client: failed to create task environment for task %q in allocation %q: %v",
   257  			r.task.Name, r.alloc.ID, err)
   258  	}
   259  
   260  	if r.task.Vault != nil {
   261  		// Read the token from the secret directory
   262  		tokenPath := filepath.Join(r.taskDir.SecretsDir, vaultTokenFile)
   263  		data, err := ioutil.ReadFile(tokenPath)
   264  		if err != nil {
   265  			if !os.IsNotExist(err) {
   266  				return fmt.Errorf("failed to read token for task %q in alloc %q: %v", r.task.Name, r.alloc.ID, err)
   267  			}
   268  
   269  			// Token file doesn't exist
   270  		} else {
   271  			// Store the recovered token
   272  			r.recoveredVaultToken = string(data)
   273  		}
   274  	}
   275  
   276  	// Restore the driver
   277  	if snap.HandleID != "" {
   278  		d, err := r.createDriver()
   279  		if err != nil {
   280  			return err
   281  		}
   282  
   283  		ctx := driver.NewExecContext(r.taskDir)
   284  		handle, err := d.Open(ctx, snap.HandleID)
   285  
   286  		// In the case it fails, we relaunch the task in the Run() method.
   287  		if err != nil {
   288  			r.logger.Printf("[ERR] client: failed to open handle to task %q for alloc %q: %v",
   289  				r.task.Name, r.alloc.ID, err)
   290  			return nil
   291  		}
   292  		r.handleLock.Lock()
   293  		r.handle = handle
   294  		r.handleLock.Unlock()
   295  
   296  		r.runningLock.Lock()
   297  		r.running = true
   298  		r.runningLock.Unlock()
   299  	}
   300  	return nil
   301  }
   302  
   303  // SaveState is used to snapshot our state
   304  func (r *TaskRunner) SaveState() error {
   305  	r.persistLock.Lock()
   306  	defer r.persistLock.Unlock()
   307  
   308  	snap := taskRunnerState{
   309  		Task:               r.task,
   310  		Version:            r.config.Version,
   311  		ArtifactDownloaded: r.artifactsDownloaded,
   312  		TaskDirBuilt:       r.taskDirBuilt,
   313  		PayloadRendered:    r.payloadRendered,
   314  		CreatedResources:   r.getCreatedResources(),
   315  	}
   316  
   317  	r.handleLock.Lock()
   318  	if r.handle != nil {
   319  		snap.HandleID = r.handle.ID()
   320  	}
   321  	r.handleLock.Unlock()
   322  	return persistState(r.stateFilePath(), &snap)
   323  }
   324  
   325  // DestroyState is used to clean up after ourselves
   326  func (r *TaskRunner) DestroyState() error {
   327  	r.persistLock.Lock()
   328  	defer r.persistLock.Unlock()
   329  
   330  	return os.RemoveAll(r.stateFilePath())
   331  }
   332  
   333  // setState is used to update the state of the task runner
   334  func (r *TaskRunner) setState(state string, event *structs.TaskEvent) {
   335  	// Persist our state to disk.
   336  	if err := r.SaveState(); err != nil {
   337  		r.logger.Printf("[ERR] client: failed to save state of Task Runner for task %q: %v", r.task.Name, err)
   338  	}
   339  
   340  	// Indicate the task has been updated.
   341  	r.updater(r.task.Name, state, event)
   342  }
   343  
   344  // setTaskEnv sets the task environment. It returns an error if it could not be
   345  // created.
   346  func (r *TaskRunner) setTaskEnv() error {
   347  	r.taskEnvLock.Lock()
   348  	defer r.taskEnvLock.Unlock()
   349  
   350  	taskEnv, err := driver.GetTaskEnv(r.taskDir, r.config.Node,
   351  		r.task.Copy(), r.alloc, r.config, r.vaultFuture.Get())
   352  	if err != nil {
   353  		return err
   354  	}
   355  	r.taskEnv = taskEnv
   356  	return nil
   357  }
   358  
   359  // getTaskEnv returns the task environment
   360  func (r *TaskRunner) getTaskEnv() *env.TaskEnvironment {
   361  	r.taskEnvLock.Lock()
   362  	defer r.taskEnvLock.Unlock()
   363  	return r.taskEnv
   364  }
   365  
   366  // createDriver makes a driver for the task
   367  func (r *TaskRunner) createDriver() (driver.Driver, error) {
   368  	env := r.getTaskEnv()
   369  	if env == nil {
   370  		return nil, fmt.Errorf("task environment not made for task %q in allocation %q", r.task.Name, r.alloc.ID)
   371  	}
   372  
   373  	// Create a task-specific event emitter callback to expose minimal
   374  	// state to drivers
   375  	eventEmitter := func(m string, args ...interface{}) {
   376  		msg := fmt.Sprintf(m, args...)
   377  		r.logger.Printf("[DEBUG] client: driver event for alloc %q: %s", r.alloc.ID, msg)
   378  		r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskDriverMessage).SetDriverMessage(msg))
   379  	}
   380  
   381  	driverCtx := driver.NewDriverContext(r.task.Name, r.alloc.ID, r.config, r.config.Node, r.logger, env, eventEmitter)
   382  	driver, err := driver.NewDriver(r.task.Driver, driverCtx)
   383  	if err != nil {
   384  		return nil, fmt.Errorf("failed to create driver '%s' for alloc %s: %v",
   385  			r.task.Driver, r.alloc.ID, err)
   386  	}
   387  	return driver, err
   388  }
   389  
   390  // Run is a long running routine used to manage the task
   391  func (r *TaskRunner) Run() {
   392  	defer close(r.waitCh)
   393  	r.logger.Printf("[DEBUG] client: starting task context for '%s' (alloc '%s')",
   394  		r.task.Name, r.alloc.ID)
   395  
   396  	// Create the initial environment, this will be recreated if a Vault token
   397  	// is needed
   398  	if err := r.setTaskEnv(); err != nil {
   399  		r.setState(
   400  			structs.TaskStateDead,
   401  			structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err))
   402  		return
   403  	}
   404  
   405  	if err := r.validateTask(); err != nil {
   406  		r.setState(
   407  			structs.TaskStateDead,
   408  			structs.NewTaskEvent(structs.TaskFailedValidation).SetValidationError(err).SetFailsTask())
   409  		return
   410  	}
   411  
   412  	// Create a driver so that we can determine the FSIsolation required
   413  	drv, err := r.createDriver()
   414  	if err != nil {
   415  		e := fmt.Errorf("failed to create driver of task %q for alloc %q: %v", r.task.Name, r.alloc.ID, err)
   416  		r.setState(
   417  			structs.TaskStateDead,
   418  			structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(e).SetFailsTask())
   419  		return
   420  	}
   421  
   422  	// Build base task directory structure regardless of FS isolation abilities.
   423  	// This needs to happen before we start the Vault manager and call prestart
   424  	// as both those can write to the task directories
   425  	if err := r.buildTaskDir(drv.FSIsolation()); err != nil {
   426  		e := fmt.Errorf("failed to build task directory for %q: %v", r.task.Name, err)
   427  		r.setState(
   428  			structs.TaskStateDead,
   429  			structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(e).SetFailsTask())
   430  		return
   431  	}
   432  
   433  	// If there is no Vault policy, leave the static future created in
   434  	// NewTaskRunner
   435  	if r.task.Vault != nil {
   436  		// Start the go-routine to get a Vault token
   437  		r.vaultFuture.Clear()
   438  		go r.vaultManager(r.recoveredVaultToken)
   439  	}
   440  
   441  	// Start the run loop
   442  	r.run()
   443  
   444  	// Do any cleanup necessary
   445  	r.postrun()
   446  
   447  	return
   448  }
   449  
   450  // validateTask validates the fields of the task and returns an error if the
   451  // task is invalid.
   452  func (r *TaskRunner) validateTask() error {
   453  	var mErr multierror.Error
   454  
   455  	// Validate the user.
   456  	unallowedUsers := r.config.ReadStringListToMapDefault("user.blacklist", config.DefaultUserBlacklist)
   457  	checkDrivers := r.config.ReadStringListToMapDefault("user.checked_drivers", config.DefaultUserCheckedDrivers)
   458  	if _, driverMatch := checkDrivers[r.task.Driver]; driverMatch {
   459  		if _, unallowed := unallowedUsers[r.task.User]; unallowed {
   460  			mErr.Errors = append(mErr.Errors, fmt.Errorf("running as user %q is disallowed", r.task.User))
   461  		}
   462  	}
   463  
   464  	// Validate the artifacts
   465  	for i, artifact := range r.task.Artifacts {
   466  		// Verify the artifact doesn't escape the task directory.
   467  		if err := artifact.Validate(); err != nil {
   468  			// If this error occurs there is potentially a server bug or
   469  			// malicious server spoofing.
   470  			r.logger.Printf("[ERR] client: allocation %q, task %v, artifact %#v (%v) fails validation: %v",
   471  				r.alloc.ID, r.task.Name, artifact, i, err)
   472  			mErr.Errors = append(mErr.Errors, fmt.Errorf("artifact (%d) failed validation: %v", i, err))
   473  		}
   474  	}
   475  
   476  	// Validate the Service names
   477  	for i, service := range r.task.Services {
   478  		name := r.taskEnv.ReplaceEnv(service.Name)
   479  		if err := service.ValidateName(name); err != nil {
   480  			mErr.Errors = append(mErr.Errors, fmt.Errorf("service (%d) failed validation: %v", i, err))
   481  		}
   482  	}
   483  
   484  	if len(mErr.Errors) == 1 {
   485  		return mErr.Errors[0]
   486  	}
   487  	return mErr.ErrorOrNil()
   488  }
   489  
   490  // tokenFuture stores the Vault token and allows consumers to block until a
   491  // valid token exists
   492  type tokenFuture struct {
   493  	waiting []chan struct{}
   494  	token   string
   495  	set     bool
   496  	m       sync.Mutex
   497  }
   498  
   499  // NewTokenFuture returns a new token future without any token set
   500  func NewTokenFuture() *tokenFuture {
   501  	return &tokenFuture{}
   502  }
   503  
   504  // Wait returns a channel that can be waited on. When this channel unblocks, a
   505  // valid token will be available via the Get method
   506  func (f *tokenFuture) Wait() <-chan struct{} {
   507  	f.m.Lock()
   508  	defer f.m.Unlock()
   509  
   510  	c := make(chan struct{})
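        	// If the token is already set, hand back a closed channel so the caller never blocks.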
   511  	if f.set {
   512  		close(c)
   513  		return c
   514  	}
   515  
   516  	f.waiting = append(f.waiting, c)
   517  	return c
   518  }
   519  
   520  // Set sets the token value and unblocks any caller of Wait
   521  func (f *tokenFuture) Set(token string) *tokenFuture {
   522  	f.m.Lock()
   523  	defer f.m.Unlock()
   524  
   525  	f.set = true
   526  	f.token = token
   527  	for _, w := range f.waiting {
   528  		close(w)
   529  	}
   530  	f.waiting = nil
   531  	return f
   532  }
   533  
   534  // Clear clears the set vault token.
   535  func (f *tokenFuture) Clear() *tokenFuture {
   536  	f.m.Lock()
   537  	defer f.m.Unlock()
   538  
   539  	f.token = ""
   540  	f.set = false
   541  	return f
   542  }
   543  
   544  // Get returns the set Vault token
   545  func (f *tokenFuture) Get() string {
   546  	f.m.Lock()
   547  	defer f.m.Unlock()
   548  	return f.token
   549  }
   550  
   551  // vaultManager should be called in a go-routine and manages the derivation,
   552  // renewal and handling of errors with the Vault token. The optional parameter
   553  // allows setting the initial Vault token. This is useful when the Vault token
   554  // is recovered off disk.
   555  func (r *TaskRunner) vaultManager(token string) {
   556  	// Helper for stopping token renewal
   557  	stopRenewal := func() {
   558  		if err := r.vaultClient.StopRenewToken(r.vaultFuture.Get()); err != nil {
   559  			r.logger.Printf("[WARN] client: failed to stop token renewal for task %v in alloc %q: %v", r.task.Name, r.alloc.ID, err)
   560  		}
   561  	}
   562  
   563  	// updatedToken lets us store state between loops. If true, a new token
   564  	// has been retrieved and we need to apply the Vault change mode
   565  	var updatedToken bool
   566  
   567  OUTER:
   568  	for {
   569  		// Check if we should exit
   570  		select {
   571  		case <-r.waitCh:
   572  			stopRenewal()
   573  			return
   574  		default:
   575  		}
   576  
   577  		// Clear the token
   578  		r.vaultFuture.Clear()
   579  
   580  		// Check if there already is a token which can be the case for
   581  		// restoring the TaskRunner
   582  		if token == "" {
   583  			// Get a token
   584  			var exit bool
   585  			token, exit = r.deriveVaultToken()
   586  			if exit {
   587  				// Exit the manager
   588  				return
   589  			}
   590  
   591  			// Write the token to disk
   592  			if err := r.writeToken(token); err != nil {
   593  				e := fmt.Errorf("failed to write Vault token to disk")
   594  				r.logger.Printf("[ERR] client: %v for task %v on alloc %q: %v", e, r.task.Name, r.alloc.ID, err)
   595  				r.Kill("vault", e.Error(), true)
   596  				return
   597  			}
   598  		}
   599  
   600  		// Start the renewal process
   601  		renewCh, err := r.vaultClient.RenewToken(token, 30)
   602  
   603  		// An error returned means the token is not being renewed
   604  		if err != nil {
   605  			r.logger.Printf("[ERR] client: failed to start renewal of Vault token for task %v on alloc %q: %v", r.task.Name, r.alloc.ID, err)
   606  			token = ""
   607  			goto OUTER
   608  		}
   609  
   610  		// The Vault token is valid now, so set it
   611  		r.vaultFuture.Set(token)
   612  
   613  		if updatedToken {
   614  			switch r.task.Vault.ChangeMode {
   615  			case structs.VaultChangeModeSignal:
   616  				s, err := signals.Parse(r.task.Vault.ChangeSignal)
   617  				if err != nil {
   618  					e := fmt.Errorf("failed to parse signal: %v", err)
   619  					r.logger.Printf("[ERR] client: %v", e)
   620  					r.Kill("vault", e.Error(), true)
   621  					return
   622  				}
   623  
   624  				if err := r.Signal("vault", "new Vault token acquired", s); err != nil {
   625  					r.logger.Printf("[ERR] client: failed to send signal to task %v for alloc %q: %v", r.task.Name, r.alloc.ID, err)
   626  					r.Kill("vault", fmt.Sprintf("failed to send signal to task: %v", err), true)
   627  					return
   628  				}
   629  			case structs.VaultChangeModeRestart:
   630  				r.Restart("vault", "new Vault token acquired")
   631  			case structs.VaultChangeModeNoop:
   632  				fallthrough
   633  			default:
   634  				r.logger.Printf("[ERR] client: Invalid Vault change mode: %q", r.task.Vault.ChangeMode)
   635  			}
   636  
   637  			// We have handled it
   638  			updatedToken = false
   639  
   640  			// Call the handler
   641  			r.updatedTokenHandler()
   642  		}
   643  
   644  		// Start watching for renewal errors
   645  		select {
   646  		case err := <-renewCh:
   647  			// Clear the token
   648  			token = ""
   649  			r.logger.Printf("[ERR] client: failed to renew Vault token for task %v on alloc %q: %v", r.task.Name, r.alloc.ID, err)
   650  			stopRenewal()
   651  
   652  			// Check if we have to do anything
   653  			if r.task.Vault.ChangeMode != structs.VaultChangeModeNoop {
   654  				updatedToken = true
   655  			}
   656  		case <-r.waitCh:
   657  			stopRenewal()
   658  			return
   659  		}
   660  	}
   661  }
   662  
   663  // deriveVaultToken derives the Vault token using exponential backoffs. It
   664  // returns the Vault token and whether the manager should exit.
   665  func (r *TaskRunner) deriveVaultToken() (token string, exit bool) {
   666  	attempts := 0
   667  	for {
   668  		tokens, err := r.vaultClient.DeriveToken(r.alloc, []string{r.task.Name})
   669  		if err == nil {
   670  			return tokens[r.task.Name], false
   671  		}
   672  
   673  		// Check if we can't recover from the error
   674  		if !structs.IsRecoverable(err) {
   675  			r.logger.Printf("[ERR] client: failed to derive Vault token for task %v on alloc %q: %v",
   676  				r.task.Name, r.alloc.ID, err)
   677  			r.Kill("vault", fmt.Sprintf("failed to derive token: %v", err), true)
   678  			return "", true
   679  		}
   680  
   681  		// Handle the retry case
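        		// The backoff grows 4x per attempt (5s, 20s, 80s, ...) and is capped just below.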
   682  		backoff := (1 << (2 * uint64(attempts))) * vaultBackoffBaseline
   683  		if backoff > vaultBackoffLimit {
   684  			backoff = vaultBackoffLimit
   685  		}
   686  		r.logger.Printf("[ERR] client: failed to derive Vault token for task %v on alloc %q: %v; retrying in %v",
   687  			r.task.Name, r.alloc.ID, err, backoff)
   688  
   689  		attempts++
   690  
   691  		// Wait till retrying
   692  		select {
   693  		case <-r.waitCh:
   694  			return "", true
   695  		case <-time.After(backoff):
   696  		}
   697  	}
   698  }
   699  
   700  // writeToken writes the given token to disk
   701  func (r *TaskRunner) writeToken(token string) error {
   702  	tokenPath := filepath.Join(r.taskDir.SecretsDir, vaultTokenFile)
   703  	if err := ioutil.WriteFile(tokenPath, []byte(token), 0777); err != nil {
   704  		return fmt.Errorf("failed to save Vault tokens to secret dir for task %q in alloc %q: %v", r.task.Name, r.alloc.ID, err)
   705  	}
   706  
   707  	return nil
   708  }
   709  
   710  // updatedTokenHandler is called when a new Vault token is retrieved. Things
   711  // that rely on the token should be updated here.
   712  func (r *TaskRunner) updatedTokenHandler() {
   713  
   714  	// Update the task's environment
   715  	if err := r.setTaskEnv(); err != nil {
   716  		r.setState(
   717  			structs.TaskStateDead,
   718  			structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
   719  		return
   720  	}
   721  
   722  	if r.templateManager != nil {
   723  		r.templateManager.Stop()
   724  
   725  		// Create a new templateManager
   726  		var err error
   727  		r.templateManager, err = NewTaskTemplateManager(r, r.task.Templates,
   728  			r.config, r.vaultFuture.Get(), r.taskDir.Dir, r.getTaskEnv())
   729  		if err != nil {
   730  			err := fmt.Errorf("failed to build task's template manager: %v", err)
   731  			r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
   732  			r.logger.Printf("[ERR] client: alloc %q, task %q %v", r.alloc.ID, r.task.Name, err)
   733  			r.Kill("vault", err.Error(), true)
   734  			return
   735  		}
   736  	}
   737  }
   738  
   739  // prestart handles life-cycle tasks that occur before the task has started.
   740  func (r *TaskRunner) prestart(resultCh chan bool) {
   741  	if r.task.Vault != nil {
   742  		// Wait for the token
   743  		r.logger.Printf("[DEBUG] client: waiting for Vault token for task %v in alloc %q", r.task.Name, r.alloc.ID)
   744  		tokenCh := r.vaultFuture.Wait()
   745  		select {
   746  		case <-tokenCh:
   747  		case <-r.waitCh:
   748  			resultCh <- false
   749  			return
   750  		}
   751  		r.logger.Printf("[DEBUG] client: retrieved Vault token for task %v in alloc %q", r.task.Name, r.alloc.ID)
   752  	}
   753  
   754  	if err := r.setTaskEnv(); err != nil {
   755  		r.setState(
   756  			structs.TaskStateDead,
   757  			structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
   758  		resultCh <- false
   759  		return
   760  	}
   761  
   762  	// If the job is a dispatch job and there is a payload, write it to disk
   763  	requirePayload := len(r.alloc.Job.Payload) != 0 &&
   764  		(r.task.DispatchPayload != nil && r.task.DispatchPayload.File != "")
   765  	if !r.payloadRendered && requirePayload {
   766  		renderTo := filepath.Join(r.taskDir.LocalDir, r.task.DispatchPayload.File)
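        		// The payload carried on the Job is snappy-compressed, so decode it before writing it out.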
   767  		decoded, err := snappy.Decode(nil, r.alloc.Job.Payload)
   768  		if err != nil {
   769  			r.setState(
   770  				structs.TaskStateDead,
   771  				structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
   772  			resultCh <- false
   773  			return
   774  		}
   775  
   776  		if err := os.MkdirAll(filepath.Dir(renderTo), 0777); err != nil {
   777  			r.setState(
   778  				structs.TaskStateDead,
   779  				structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
   780  			resultCh <- false
   781  			return
   782  		}
   783  
   784  		if err := ioutil.WriteFile(renderTo, decoded, 0777); err != nil {
   785  			r.setState(
   786  				structs.TaskStateDead,
   787  				structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
   788  			resultCh <- false
   789  			return
   790  		}
   791  
   792  		r.payloadRendered = true
   793  	}
   794  
   795  	for {
   796  		r.persistLock.Lock()
   797  		downloaded := r.artifactsDownloaded
   798  		r.persistLock.Unlock()
   799  
   800  		// Download the task's artifacts
   801  		if !downloaded && len(r.task.Artifacts) > 0 {
   802  			r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskDownloadingArtifacts))
   803  			for _, artifact := range r.task.Artifacts {
   804  				if err := getter.GetArtifact(r.getTaskEnv(), artifact, r.taskDir.Dir); err != nil {
   805  					wrapped := fmt.Errorf("failed to download artifact %q: %v", artifact.GetterSource, err)
   806  					r.logger.Printf("[DEBUG] client: %v", wrapped)
   807  					r.setState(structs.TaskStatePending,
   808  						structs.NewTaskEvent(structs.TaskArtifactDownloadFailed).SetDownloadError(wrapped))
   809  					r.restartTracker.SetStartError(structs.WrapRecoverable(wrapped.Error(), err))
   810  					goto RESTART
   811  				}
   812  			}
   813  
   814  			r.persistLock.Lock()
   815  			r.artifactsDownloaded = true
   816  			r.persistLock.Unlock()
   817  		}
   818  
   819  		// We don't have to wait for any template
   820  		if len(r.task.Templates) == 0 {
   821  			// Send the start signal
   822  			select {
   823  			case r.startCh <- struct{}{}:
   824  			default:
   825  			}
   826  
   827  			resultCh <- true
   828  			return
   829  		}
   830  
   831  		// Build the template manager
   832  		if r.templateManager == nil {
   833  			var err error
   834  			r.templateManager, err = NewTaskTemplateManager(r, r.task.Templates,
   835  				r.config, r.vaultFuture.Get(), r.taskDir.Dir, r.getTaskEnv())
   836  			if err != nil {
   837  				err := fmt.Errorf("failed to build task's template manager: %v", err)
   838  				r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
   839  				r.logger.Printf("[ERR] client: alloc %q, task %q %v", r.alloc.ID, r.task.Name, err)
   840  				resultCh <- false
   841  				return
   842  			}
   843  		}
   844  
   845  		// Block for consul-template
   846  		// TODO Hooks should register themselves as blocking and then we can
   847  		// periodically enumerate what we are still blocked on
   848  		select {
   849  		case <-r.unblockCh:
   850  			// Send the start signal
   851  			select {
   852  			case r.startCh <- struct{}{}:
   853  			default:
   854  			}
   855  
   856  			resultCh <- true
   857  			return
   858  		case <-r.waitCh:
   859  			// The run loop has exited so exit too
   860  			resultCh <- false
   861  			return
   862  		}
   863  
   864  	RESTART:
   865  		restart := r.shouldRestart()
   866  		if !restart {
   867  			resultCh <- false
   868  			return
   869  		}
   870  	}
   871  }
   872  
   873  // postrun is used to do any cleanup that is necessary after exiting the run loop
   874  func (r *TaskRunner) postrun() {
   875  	// Stop the template manager
   876  	if r.templateManager != nil {
   877  		r.templateManager.Stop()
   878  	}
   879  }
   880  
   881  // run is the main run loop that handles starting the application, destroying
   882  // it, restarts and signals.
   883  func (r *TaskRunner) run() {
   884  	// Predeclare things so we can jump to the RESTART
   885  	var stopCollection chan struct{}
   886  	var handleWaitCh chan *dstructs.WaitResult
   887  
   888  	// If we already have a handle (e.g. after a restore), populate stopCollection
   889  	// and handleWaitCh to maintain the invariant that they exist whenever a handle does.
   890  	r.handleLock.Lock()
   891  	handleEmpty := r.handle == nil
   892  	r.handleLock.Unlock()
   893  
   894  	if !handleEmpty {
   895  		stopCollection = make(chan struct{})
   896  		go r.collectResourceUsageStats(stopCollection)
   897  		handleWaitCh = r.handle.WaitCh()
   898  	}
   899  
   900  	for {
   901  		// Do the prestart activities
   902  		prestartResultCh := make(chan bool, 1)
   903  		go r.prestart(prestartResultCh)
   904  
   905  	WAIT:
   906  		for {
   907  			select {
   908  			case success := <-prestartResultCh:
   909  				if !success {
   910  					r.cleanup()
   911  					r.setState(structs.TaskStateDead, nil)
   912  					return
   913  				}
   914  			case <-r.startCh:
   915  				// Start the task if not yet started or it is being forced. This logic
   916  				// is necessary because in the case of a restore the handle already
   917  				// exists.
   918  				r.handleLock.Lock()
   919  				handleEmpty := r.handle == nil
   920  				r.handleLock.Unlock()
   921  				if handleEmpty {
   922  					startErr := r.startTask()
   923  					r.restartTracker.SetStartError(startErr)
   924  					if startErr != nil {
   925  						r.setState("", structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(startErr))
   926  						goto RESTART
   927  					}
   928  
   929  					// Mark the task as started
   930  					r.setState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted))
   931  					r.runningLock.Lock()
   932  					r.running = true
   933  					r.runningLock.Unlock()
   934  
   935  					if stopCollection == nil {
   936  						stopCollection = make(chan struct{})
   937  						go r.collectResourceUsageStats(stopCollection)
   938  					}
   939  
   940  					handleWaitCh = r.handle.WaitCh()
   941  				}
   942  
   943  			case waitRes := <-handleWaitCh:
   944  				if waitRes == nil {
   945  					panic("nil wait")
   946  				}
   947  
   948  				r.runningLock.Lock()
   949  				r.running = false
   950  				r.runningLock.Unlock()
   951  
   952  				// Stop collection of the task's resource usage
   953  				close(stopCollection)
   954  
   955  				// Log whether the task was successful or not.
   956  				r.restartTracker.SetWaitResult(waitRes)
   957  				r.setState("", r.waitErrorToEvent(waitRes))
   958  				if !waitRes.Successful() {
   959  					r.logger.Printf("[INFO] client: task %q for alloc %q failed: %v", r.task.Name, r.alloc.ID, waitRes)
   960  				} else {
   961  					r.logger.Printf("[INFO] client: task %q for alloc %q completed successfully", r.task.Name, r.alloc.ID)
   962  				}
   963  
   964  				break WAIT
   965  			case update := <-r.updateCh:
   966  				if err := r.handleUpdate(update); err != nil {
   967  					r.logger.Printf("[ERR] client: update to task %q failed: %v", r.task.Name, err)
   968  				}
   969  
   970  			case se := <-r.signalCh:
   971  				r.runningLock.Lock()
   972  				running := r.running
   973  				r.runningLock.Unlock()
   974  				common := fmt.Sprintf("signal %v to task %v for alloc %q", se.s, r.task.Name, r.alloc.ID)
   975  				if !running {
   976  					// Send no error
   977  					r.logger.Printf("[DEBUG] client: skipping %s", common)
   978  					se.result <- nil
   979  					continue
   980  				}
   981  
   982  				r.logger.Printf("[DEBUG] client: sending %s", common)
   983  				r.setState(structs.TaskStateRunning, se.e)
   984  
   985  				res := r.handle.Signal(se.s)
   986  				se.result <- res
   987  
   988  			case event := <-r.restartCh:
   989  				r.runningLock.Lock()
   990  				running := r.running
   991  				r.runningLock.Unlock()
   992  				common := fmt.Sprintf("task %v for alloc %q", r.task.Name, r.alloc.ID)
   993  				if !running {
   994  					r.logger.Printf("[DEBUG] client: skipping restart of %v: task isn't running", common)
   995  					continue
   996  				}
   997  
   998  				r.logger.Printf("[DEBUG] client: restarting %s: %v", common, event.RestartReason)
   999  				r.setState(structs.TaskStateRunning, event)
  1000  				r.killTask(nil)
  1001  
  1002  				close(stopCollection)
  1003  
  1004  				if handleWaitCh != nil {
  1005  					<-handleWaitCh
  1006  				}
  1007  
  1008  				// Since the restart isn't from a failure, restart immediately
  1009  				// and don't count against the restart policy
  1010  				r.restartTracker.SetRestartTriggered()
  1011  				break WAIT
  1012  
  1013  			case <-r.destroyCh:
  1014  				r.runningLock.Lock()
  1015  				running := r.running
  1016  				r.runningLock.Unlock()
  1017  				if !running {
  1018  					r.cleanup()
  1019  					r.setState(structs.TaskStateDead, r.destroyEvent)
  1020  					return
  1021  				}
  1022  
  1023  				// Store the task event that provides context on the task
  1024  				// destroy. The Killed event is set from the alloc_runner and
  1025  				// doesn't add detail
  1026  				var killEvent *structs.TaskEvent
  1027  				if r.destroyEvent.Type != structs.TaskKilled {
  1028  					if r.destroyEvent.Type == structs.TaskKilling {
  1029  						killEvent = r.destroyEvent
  1030  					} else {
  1031  						r.setState(structs.TaskStateRunning, r.destroyEvent)
  1032  					}
  1033  				}
  1034  
  1035  				r.killTask(killEvent)
  1036  				close(stopCollection)
  1037  
  1038  				// Wait for handler to exit before calling cleanup
  1039  				<-handleWaitCh
  1040  				r.cleanup()
  1041  
  1042  				r.setState(structs.TaskStateDead, nil)
  1043  				return
  1044  			}
  1045  		}
  1046  
  1047  	RESTART:
  1048  		restart := r.shouldRestart()
  1049  		if !restart {
  1050  			r.cleanup()
  1051  			r.setState(structs.TaskStateDead, nil)
  1052  			return
  1053  		}
  1054  
  1055  		// Clear the handle so a new driver will be created.
  1056  		r.handleLock.Lock()
  1057  		r.handle = nil
  1058  		handleWaitCh = nil
  1059  		stopCollection = nil
  1060  		r.handleLock.Unlock()
  1061  	}
  1062  }
  1063  
  1064  // cleanup calls Driver.Cleanup when a task is stopping. Errors are logged.
  1065  func (r *TaskRunner) cleanup() {
  1066  	drv, err := r.createDriver()
  1067  	if err != nil {
  1068  		r.logger.Printf("[ERR] client: error creating driver to cleanup resources: %v", err)
  1069  		return
  1070  	}
  1071  
  1072  	res := r.getCreatedResources()
  1073  
  1074  	ctx := driver.NewExecContext(r.taskDir)
  1075  	attempts := 1
  1076  	var cleanupErr error
  1077  	for retry := true; retry; attempts++ {
  1078  		cleanupErr = drv.Cleanup(ctx, res)
  1079  		retry = structs.IsRecoverable(cleanupErr)
  1080  
  1081  		// Copy current createdResources state in case SaveState is
  1082  		// called between retries
  1083  		r.setCreatedResources(res)
  1084  
  1085  		// Retry 3 times with sleeps between
  1086  		if !retry || attempts > 3 {
  1087  			break
  1088  		}
  1089  		time.Sleep(time.Duration(attempts) * time.Second)
  1090  	}
  1091  
  1092  	if cleanupErr != nil {
  1093  		r.logger.Printf("[ERR] client: error cleaning up resources for task %q after %d attempts: %v", r.task.Name, attempts, cleanupErr)
  1094  	}
  1095  	return
  1096  }
  1097  
  1098  // shouldRestart returns whether the task should restart. If the return value is
  1099  // true, the task's restart policy has already been considered and any wait time
  1100  // between restarts has been applied.
  1101  func (r *TaskRunner) shouldRestart() bool {
  1102  	state, when := r.restartTracker.GetState()
  1103  	reason := r.restartTracker.GetReason()
  1104  	switch state {
  1105  	case structs.TaskNotRestarting, structs.TaskTerminated:
  1106  		r.logger.Printf("[INFO] client: Not restarting task %v for alloc %v", r.task.Name, r.alloc.ID)
  1107  		if state == structs.TaskNotRestarting {
  1108  			r.setState(structs.TaskStateDead,
  1109  				structs.NewTaskEvent(structs.TaskNotRestarting).
  1110  					SetRestartReason(reason).SetFailsTask())
  1111  		}
  1112  		return false
  1113  	case structs.TaskRestarting:
  1114  		r.logger.Printf("[INFO] client: Restarting task %q for alloc %q in %v", r.task.Name, r.alloc.ID, when)
  1115  		r.setState(structs.TaskStatePending,
  1116  			structs.NewTaskEvent(structs.TaskRestarting).
  1117  				SetRestartDelay(when).
  1118  				SetRestartReason(reason))
  1119  	default:
  1120  		r.logger.Printf("[ERR] client: restart tracker returned unknown state: %q", state)
  1121  		return false
  1122  	}
  1123  
  1124  	// Sleep but watch for destroy events.
  1125  	select {
  1126  	case <-time.After(when):
  1127  	case <-r.destroyCh:
  1128  	}
  1129  
  1130  	// Destroyed while we were waiting to restart, so abort.
  1131  	r.destroyLock.Lock()
  1132  	destroyed := r.destroy
  1133  	r.destroyLock.Unlock()
  1134  	if destroyed {
  1135  		r.logger.Printf("[DEBUG] client: Not restarting task: %v because it has been destroyed", r.task.Name)
  1136  		r.setState(structs.TaskStateDead, r.destroyEvent)
  1137  		return false
  1138  	}
  1139  
  1140  	return true
  1141  }
  1142  
  1143  // killTask kills the running task. A killing event can optionally be passed and
  1144  // this event is used to mark the task as being killed. It provides a means to
  1145  // store extra information.
  1146  func (r *TaskRunner) killTask(killingEvent *structs.TaskEvent) {
  1147  	r.runningLock.Lock()
  1148  	running := r.running
  1149  	r.runningLock.Unlock()
  1150  	if !running {
  1151  		return
  1152  	}
  1153  
  1154  	// Get the kill timeout
  1155  	timeout := driver.GetKillTimeout(r.task.KillTimeout, r.config.MaxKillTimeout)
  1156  
  1157  	// Build the event
  1158  	var event *structs.TaskEvent
  1159  	if killingEvent != nil {
  1160  		event = killingEvent
  1161  		event.Type = structs.TaskKilling
  1162  	} else {
  1163  		event = structs.NewTaskEvent(structs.TaskKilling)
  1164  	}
  1165  	event.SetKillTimeout(timeout)
  1166  
  1167  	// Mark that we received the kill event
  1168  	r.setState(structs.TaskStateRunning, event)
  1169  
  1170  	// Kill the task using an exponential backoff in case of failures.
  1171  	destroySuccess, err := r.handleDestroy()
  1172  	if !destroySuccess {
  1173  		// We couldn't successfully destroy the resource created.
  1174  		r.logger.Printf("[ERR] client: failed to kill task %q. Resources may have been leaked: %v", r.task.Name, err)
  1175  	}
  1176  
  1177  	r.runningLock.Lock()
  1178  	r.running = false
  1179  	r.runningLock.Unlock()
  1180  
  1181  	// Store that the task has been destroyed and any associated error.
  1182  	r.setState("", structs.NewTaskEvent(structs.TaskKilled).SetKillError(err))
  1183  }
  1184  
  1185  // startTask creates the driver, task dir, and starts the task.
  1186  func (r *TaskRunner) startTask() error {
  1187  	// Create a driver
  1188  	drv, err := r.createDriver()
  1189  	if err != nil {
  1190  		return fmt.Errorf("failed to create driver of task %q for alloc %q: %v",
  1191  			r.task.Name, r.alloc.ID, err)
  1192  	}
  1193  
  1194  	// Run prestart
  1195  	ctx := driver.NewExecContext(r.taskDir)
  1196  	res, err := drv.Prestart(ctx, r.task)
  1197  
  1198  	// Merge newly created resources into previously created resources
  1199  	r.createdResourcesLock.Lock()
  1200  	r.createdResources.Merge(res)
  1201  	r.createdResourcesLock.Unlock()
  1202  
  1203  	if err != nil {
  1204  		wrapped := fmt.Sprintf("failed to initialize task %q for alloc %q: %v",
  1205  			r.task.Name, r.alloc.ID, err)
  1206  		r.logger.Printf("[WARN] client: error from prestart: %s", wrapped)
  1207  		return structs.WrapRecoverable(wrapped, err)
  1208  	}
  1209  
  1210  	// Start the job
  1211  	handle, err := drv.Start(ctx, r.task)
  1212  	if err != nil {
  1213  		wrapped := fmt.Sprintf("failed to start task %q for alloc %q: %v",
  1214  			r.task.Name, r.alloc.ID, err)
  1215  		r.logger.Printf("[WARN] client: %s", wrapped)
  1216  		return structs.WrapRecoverable(wrapped, err)
  1217  
  1218  	}
  1219  
  1220  	r.handleLock.Lock()
  1221  	r.handle = handle
  1222  	r.handleLock.Unlock()
  1223  	return nil
  1224  }
  1225  
  1226  // buildTaskDir creates the task directory before driver.Prestart. It is safe
  1227  // to call multiple times as its state is persisted.
  1228  func (r *TaskRunner) buildTaskDir(fsi cstructs.FSIsolation) error {
  1229  	r.persistLock.Lock()
  1230  	built := r.taskDirBuilt
  1231  	r.persistLock.Unlock()
  1232  
  1233  	// We do not set the state again since this only occurs during restoration
  1234  	// and the task dir is already built. The reason we call Build again is to
  1235  	// ensure that the task dir invariants are still held.
  1236  	if !built {
  1237  		r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskSetup).
  1238  			SetMessage(structs.TaskBuildingTaskDir))
  1239  	}
  1240  
  1241  	chroot := config.DefaultChrootEnv
  1242  	if len(r.config.ChrootEnv) > 0 {
  1243  		chroot = r.config.ChrootEnv
  1244  	}
  1245  	if err := r.taskDir.Build(built, chroot, fsi); err != nil {
  1246  		return err
  1247  	}
  1248  
  1249  	// Mark task dir as successfully built
  1250  	r.persistLock.Lock()
  1251  	r.taskDirBuilt = true
  1252  	r.persistLock.Unlock()
  1253  	return nil
  1254  }
  1255  
  1256  // collectResourceUsageStats starts collecting resource usage stats of a Task.
  1257  // Collection ends when the passed channel is closed
  1258  func (r *TaskRunner) collectResourceUsageStats(stopCollection <-chan struct{}) {
  1259  	// start collecting the stats right away and then start collecting every
  1260  	// collection interval
  1261  	next := time.NewTimer(0)
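        	// A zero-duration timer fires immediately; each tick below re-arms it with the collection interval.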
  1262  	defer next.Stop()
  1263  	for {
  1264  		select {
  1265  		case <-next.C:
  1266  			next.Reset(r.config.StatsCollectionInterval)
  1267  			if r.handle == nil {
  1268  				continue
  1269  			}
  1270  			ru, err := r.handle.Stats()
  1271  
  1272  			if err != nil {
  1273  				// Check if the driver doesn't implement stats
  1274  				if err.Error() == driver.DriverStatsNotImplemented.Error() {
  1275  					r.logger.Printf("[DEBUG] client: driver for task %q in allocation %q doesn't support stats", r.task.Name, r.alloc.ID)
  1276  					return
  1277  				}
  1278  
  1279  				// We do not log when the plugin is shutdown as this is simply a
  1280  				// race between the stopCollection channel being closed and calling
  1281  				// Stats on the handle.
  1282  				if !strings.Contains(err.Error(), "connection is shut down") {
  1283  					r.logger.Printf("[WARN] client: error fetching stats of task %v: %v", r.task.Name, err)
  1284  				}
  1285  				continue
  1286  			}
  1287  
  1288  			r.resourceUsageLock.Lock()
  1289  			r.resourceUsage = ru
  1290  			r.resourceUsageLock.Unlock()
  1291  			if ru != nil {
  1292  				r.emitStats(ru)
  1293  			}
  1294  		case <-stopCollection:
  1295  			return
  1296  		}
  1297  	}
  1298  }
  1299  
  1300  // LatestResourceUsage returns the last resource utilization datapoint collected
  1301  func (r *TaskRunner) LatestResourceUsage() *cstructs.TaskResourceUsage {
  1302  	r.resourceUsageLock.RLock()
  1303  	defer r.resourceUsageLock.RUnlock()
  1304  	r.runningLock.Lock()
  1305  	defer r.runningLock.Unlock()
  1306  
  1307  	// If the task is not running there can be no latest resource
  1308  	if !r.running {
  1309  		return nil
  1310  	}
  1311  
  1312  	return r.resourceUsage
  1313  }
  1314  
  1315  // handleUpdate takes an updated allocation and updates internal state to
  1316  // reflect the new config for the task.
  1317  func (r *TaskRunner) handleUpdate(update *structs.Allocation) error {
  1318  	// Extract the task group from the alloc.
  1319  	tg := update.Job.LookupTaskGroup(update.TaskGroup)
  1320  	if tg == nil {
  1321  		return fmt.Errorf("alloc '%s' missing task group '%s'", update.ID, update.TaskGroup)
  1322  	}
  1323  
  1324  	// Extract the task.
  1325  	var updatedTask *structs.Task
  1326  	for _, t := range tg.Tasks {
  1327  		if t.Name == r.task.Name {
  1328  			updatedTask = t.Copy()
  1329  		}
  1330  	}
  1331  	if updatedTask == nil {
  1332  		return fmt.Errorf("task group %q doesn't contain task %q", tg.Name, r.task.Name)
  1333  	}
  1334  
  1335  	// Merge in the task resources
  1336  	updatedTask.Resources = update.TaskResources[updatedTask.Name]
  1337  
  1338  	// Update will update resources and store the new kill timeout.
  1339  	var mErr multierror.Error
  1340  	r.handleLock.Lock()
  1341  	if r.handle != nil {
  1342  		if err := r.handle.Update(updatedTask); err != nil {
  1343  			mErr.Errors = append(mErr.Errors, fmt.Errorf("updating task resources failed: %v", err))
  1344  		}
  1345  	}
  1346  	r.handleLock.Unlock()
  1347  
  1348  	// Update the restart policy.
  1349  	if r.restartTracker != nil {
  1350  		r.restartTracker.SetPolicy(tg.RestartPolicy)
  1351  	}
  1352  
  1353  	// Store the updated alloc.
  1354  	r.alloc = update
  1355  	r.task = updatedTask
  1356  	return mErr.ErrorOrNil()
  1357  }
  1358  
  1359  // handleDestroy kills the task handle. In the case that killing fails,
  1360  // handleDestroy will retry with an exponential backoff and will give up at a
  1361  // given limit. It returns whether the task was destroyed and the error
  1362  // associated with the last kill attempt.
  1363  func (r *TaskRunner) handleDestroy() (destroyed bool, err error) {
  1364  	// Cap the number of times we attempt to kill the task.
  1365  	for i := 0; i < killFailureLimit; i++ {
  1366  		if err = r.handle.Kill(); err != nil {
  1367  			// Calculate the new backoff
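        			// Same 4x-per-attempt growth as the Vault backoff: 5s, 20s, 80s, ..., capped at killBackoffLimit.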
  1368  			backoff := (1 << (2 * uint64(i))) * killBackoffBaseline
  1369  			if backoff > killBackoffLimit {
  1370  				backoff = killBackoffLimit
  1371  			}
  1372  
  1373  			r.logger.Printf("[ERR] client: failed to kill task '%s' for alloc %q. Retrying in %v: %v",
  1374  				r.task.Name, r.alloc.ID, backoff, err)
  1375  			time.Sleep(time.Duration(backoff))
  1376  		} else {
  1377  			// Kill was successful
  1378  			return true, nil
  1379  		}
  1380  	}
  1381  	return
  1382  }
  1383  
  1384  // Restart will restart the task
  1385  func (r *TaskRunner) Restart(source, reason string) {
  1386  	reasonStr := fmt.Sprintf("%s: %s", source, reason)
  1387  	event := structs.NewTaskEvent(structs.TaskRestartSignal).SetRestartReason(reasonStr)
  1388  
  1389  	select {
  1390  	case r.restartCh <- event:
  1391  	case <-r.waitCh:
  1392  	}
  1393  }
  1394  
  1395  // Signal will send a signal to the task
  1396  func (r *TaskRunner) Signal(source, reason string, s os.Signal) error {
  1397  
  1398  	reasonStr := fmt.Sprintf("%s: %s", source, reason)
  1399  	event := structs.NewTaskEvent(structs.TaskSignaling).SetTaskSignal(s).SetTaskSignalReason(reasonStr)
  1400  
  1401  	resCh := make(chan error)
  1402  	se := SignalEvent{
  1403  		s:      s,
  1404  		e:      event,
  1405  		result: resCh,
  1406  	}
  1407  
  1408  	select {
  1409  	case r.signalCh <- se:
  1410  	case <-r.waitCh:
  1411  	}
  1412  
  1413  	return <-resCh
  1414  }
  1415  
  1416  // Kill will kill a task and store the error, no longer restarting the task. If
  1417  // fail is set, the task is marked as having failed.
  1418  func (r *TaskRunner) Kill(source, reason string, fail bool) {
  1419  	reasonStr := fmt.Sprintf("%s: %s", source, reason)
  1420  	event := structs.NewTaskEvent(structs.TaskKilling).SetKillReason(reasonStr)
  1421  	if fail {
  1422  		event.SetFailsTask()
  1423  	}
  1424  
  1425  	r.logger.Printf("[DEBUG] client: killing task %v for alloc %q: %v", r.task.Name, r.alloc.ID, reasonStr)
  1426  	r.Destroy(event)
  1427  }
  1428  
  1429  // UnblockStart unblocks the starting of the task. It currently assumes only
  1430  // consul-template will unblock it
  1431  func (r *TaskRunner) UnblockStart(source string) {
  1432  	r.unblockLock.Lock()
  1433  	defer r.unblockLock.Unlock()
  1434  	if r.unblocked {
  1435  		return
  1436  	}
  1437  
  1438  	r.logger.Printf("[DEBUG] client: unblocking task %v for alloc %q: %v", r.task.Name, r.alloc.ID, source)
  1439  	r.unblocked = true
  1440  	close(r.unblockCh)
  1441  }
  1442  
  1443  // Helper function for converting a WaitResult into a TaskTerminated event.
  1444  func (r *TaskRunner) waitErrorToEvent(res *dstructs.WaitResult) *structs.TaskEvent {
  1445  	return structs.NewTaskEvent(structs.TaskTerminated).
  1446  		SetExitCode(res.ExitCode).
  1447  		SetSignal(res.Signal).
  1448  		SetExitMessage(res.Err)
  1449  }
  1450  
  1451  // Update is used to update the task of the context
  1452  func (r *TaskRunner) Update(update *structs.Allocation) {
  1453  	select {
  1454  	case r.updateCh <- update:
  1455  	default:
  1456  		r.logger.Printf("[ERR] client: dropping task update '%s' (alloc '%s')",
  1457  			r.task.Name, r.alloc.ID)
  1458  	}
  1459  }
  1460  
  1461  // Destroy is used to indicate that the task context should be destroyed. The
  1462  // event parameter provides a context for the destroy.
  1463  func (r *TaskRunner) Destroy(event *structs.TaskEvent) {
  1464  	r.destroyLock.Lock()
  1465  	defer r.destroyLock.Unlock()
  1466  
  1467  	if r.destroy {
  1468  		return
  1469  	}
  1470  	r.destroy = true
  1471  	r.destroyEvent = event
  1472  	close(r.destroyCh)
  1473  }
  1474  
  1475  // getCreatedResources returns the resources created by drivers. It will never
  1476  // return nil.
  1477  func (r *TaskRunner) getCreatedResources() *driver.CreatedResources {
  1478  	r.createdResourcesLock.Lock()
  1479  	if r.createdResources == nil {
  1480  		r.createdResources = driver.NewCreatedResources()
  1481  	}
  1482  	cr := r.createdResources.Copy()
  1483  	r.createdResourcesLock.Unlock()
  1484  
  1485  	return cr
  1486  }
  1487  
  1488  // setCreatedResources updates the resources created by drivers. If passed nil
  1489  // it will set createdResources to an initialized struct.
  1490  func (r *TaskRunner) setCreatedResources(cr *driver.CreatedResources) {
  1491  	if cr == nil {
  1492  		cr = driver.NewCreatedResources()
  1493  	}
  1494  	r.createdResourcesLock.Lock()
  1495  	r.createdResources = cr.Copy()
  1496  	r.createdResourcesLock.Unlock()
  1497  }
  1498  
  1499  // emitStats emits resource usage stats of tasks to remote metrics collector
  1500  // sinks
  1501  func (r *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) {
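        	// Gauge keys take the form: client.allocs.<job>.<task group>.<alloc id>.<task>.<memory|cpu>.<stat>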
  1502  	if ru.ResourceUsage.MemoryStats != nil && r.config.PublishAllocationMetrics {
  1503  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "rss"}, float32(ru.ResourceUsage.MemoryStats.RSS))
  1504  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "cache"}, float32(ru.ResourceUsage.MemoryStats.Cache))
  1505  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "swap"}, float32(ru.ResourceUsage.MemoryStats.Swap))
  1506  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "max_usage"}, float32(ru.ResourceUsage.MemoryStats.MaxUsage))
  1507  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "kernel_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelUsage))
  1508  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "kernel_max_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage))
  1509  	}
  1510  
  1511  	if ru.ResourceUsage.CpuStats != nil && r.config.PublishAllocationMetrics {
  1512  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_percent"}, float32(ru.ResourceUsage.CpuStats.Percent))
  1513  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "system"}, float32(ru.ResourceUsage.CpuStats.SystemMode))
  1514  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "user"}, float32(ru.ResourceUsage.CpuStats.UserMode))
  1515  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "throttled_time"}, float32(ru.ResourceUsage.CpuStats.ThrottledTime))
  1516  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "throttled_periods"}, float32(ru.ResourceUsage.CpuStats.ThrottledPeriods))
  1517  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_ticks"}, float32(ru.ResourceUsage.CpuStats.TotalTicks))
  1518  	}
  1519  }