github.com/hhrutter/nomad@v0.6.0-rc2.0.20170723054333-80c4b03f0705/client/task_runner.go

     1  package client
     2  
     3  import (
     4  	"bytes"
     5  	"crypto/md5"
     6  	"encoding/hex"
     7  	"fmt"
     8  	"io"
     9  	"io/ioutil"
    10  	"log"
    11  	"os"
    12  	"path/filepath"
    13  	"strings"
    14  	"sync"
    15  	"time"
    16  
    17  	"github.com/armon/go-metrics"
    18  	"github.com/boltdb/bolt"
    19  	"github.com/golang/snappy"
    20  	"github.com/hashicorp/consul-template/signals"
    21  	"github.com/hashicorp/go-multierror"
    22  	version "github.com/hashicorp/go-version"
    23  	"github.com/hashicorp/nomad/client/allocdir"
    24  	"github.com/hashicorp/nomad/client/config"
    25  	"github.com/hashicorp/nomad/client/driver"
    26  	"github.com/hashicorp/nomad/client/getter"
    27  	"github.com/hashicorp/nomad/client/vaultclient"
    28  	"github.com/hashicorp/nomad/nomad/structs"
    29  	"github.com/ugorji/go/codec"
    30  
    31  	"github.com/hashicorp/nomad/client/driver/env"
    32  	dstructs "github.com/hashicorp/nomad/client/driver/structs"
    33  	cstructs "github.com/hashicorp/nomad/client/structs"
    34  )
    35  
    36  const (
    37  	// killBackoffBaseline is the baseline time for exponential backoff while
    38  	// killing a task.
    39  	killBackoffBaseline = 5 * time.Second
    40  
    41  	// killBackoffLimit is the limit of the exponential backoff for killing
    42  	// the task.
    43  	killBackoffLimit = 2 * time.Minute
    44  
    45  	// killFailureLimit is how many times we will attempt to kill a task before
    46  	// giving up and potentially leaking resources.
    47  	killFailureLimit = 5
    48  
    49  	// vaultBackoffBaseline is the baseline time for exponential backoff when
    50  	// attempting to retrieve a Vault token
    51  	vaultBackoffBaseline = 5 * time.Second
    52  
    53  	// vaultBackoffLimit is the limit of the exponential backoff when attempting
    54  	// to retrieve a Vault token
    55  	vaultBackoffLimit = 3 * time.Minute
    56  
    57  	// vaultTokenFile is the name of the file holding the Vault token inside the
    58  	// task's secret directory
    59  	vaultTokenFile = "vault_token"
    60  )
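
// The vault backoff constants above drive the capped exponential backoff used
// by deriveVaultToken below. The following is an illustrative sketch (not part
// of the original source) of the schedule they imply, mirroring the formula
// backoff = vaultBackoffBaseline * 2^(2*attempt), clamped to vaultBackoffLimit.
func exampleVaultBackoffSchedule() []time.Duration {
	var schedule []time.Duration
	for attempt := uint64(0); attempt < 4; attempt++ {
		backoff := (1 << (2 * attempt)) * vaultBackoffBaseline
		if backoff > vaultBackoffLimit {
			backoff = vaultBackoffLimit
		}
		// Yields 5s, 20s, 80s, then 3m0s once the cap kicks in.
		schedule = append(schedule, backoff)
	}
	return schedule
}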
    61  
    62  var (
    63  	// taskRunnerStateAllKey holds all of the task runner's state. At the
    64  	// moment there is no need to split it.
    65  	taskRunnerStateAllKey = []byte("simple-all")
    66  )
    67  
    68  // TaskRunner is used to wrap a task within an allocation and provide the execution context.
    69  type TaskRunner struct {
    70  	stateDB        *bolt.DB
    71  	config         *config.Config
    72  	updater        TaskStateUpdater
    73  	logger         *log.Logger
    74  	alloc          *structs.Allocation
    75  	restartTracker *RestartTracker
    76  	consul         ConsulServiceAPI
    77  
    78  	// running marks whether the task is running
    79  	running     bool
    80  	runningLock sync.Mutex
    81  
    82  	resourceUsage     *cstructs.TaskResourceUsage
    83  	resourceUsageLock sync.RWMutex
    84  
    85  	task    *structs.Task
    86  	taskDir *allocdir.TaskDir
    87  
    88  	// envBuilder is used to build the task's environment
    89  	envBuilder *env.Builder
    90  
    91  	// driverNet is the network information returned by the driver
    92  	driverNet     *cstructs.DriverNetwork
    93  	driverNetLock sync.Mutex
    94  
    95  	// updateCh is used to receive updated versions of the allocation
    96  	updateCh chan *structs.Allocation
    97  
    98  	handle     driver.DriverHandle
    99  	handleLock sync.Mutex
   100  
   101  	// artifactsDownloaded tracks whether the task's artifacts have been
   102  	// downloaded
   103  	//
   104  	// Must acquire persistLock when accessing
   105  	artifactsDownloaded bool
   106  
   107  	// taskDirBuilt tracks whether the task has built its directory.
   108  	//
   109  	// Must acquire persistLock when accessing
   110  	taskDirBuilt bool
   111  
   112  	// createdResources are all the resources created by the task driver
   113  	// across all attempts to start the task.
   114  	// Simple gets and sets should use {get,set}CreatedResources
   115  	createdResources     *driver.CreatedResources
   116  	createdResourcesLock sync.Mutex
   117  
   118  	// payloadRendered tracks whether the payload has been rendered to disk
   119  	payloadRendered bool
   120  
   121  	// vaultFuture is the means to wait for and get a Vault token
   122  	vaultFuture *tokenFuture
   123  
   124  	// recoveredVaultToken is the token that was recovered through a restore
   125  	recoveredVaultToken string
   126  
   127  	// vaultClient is used to retrieve and renew any needed Vault token
   128  	vaultClient vaultclient.VaultClient
   129  
   130  	// templateManager is used to manage any consul-templates this task may have
   131  	templateManager *TaskTemplateManager
   132  
   133  	// startCh is used to trigger the start of the task
   134  	startCh chan struct{}
   135  
   136  	// unblockCh is used to unblock the starting of the task
   137  	unblockCh   chan struct{}
   138  	unblocked   bool
   139  	unblockLock sync.Mutex
   140  
   141  	// restartCh is used to restart a task
   142  	restartCh chan *structs.TaskEvent
   143  
   144  	// signalCh is used to send a signal to a task
   145  	signalCh chan SignalEvent
   146  
   147  	destroy      bool
   148  	destroyCh    chan struct{}
   149  	destroyLock  sync.Mutex
   150  	destroyEvent *structs.TaskEvent
   151  
   152  	// waitCh closing marks the run loop as having exited
   153  	waitCh chan struct{}
   154  
   155  	// persistLock must be acquired when accessing fields stored by
   156  	// SaveState. SaveState is called asynchronously to TaskRunner.Run by
   157  	// AllocRunner, so all state fields must be synchronized using this
   158  	// lock.
   159  	persistLock sync.Mutex
   160  
   161  	// persistedHash is the hash of the last persisted snapshot. It is used to
   162  	// detect if a new snapshot has to be written to disk.
   163  	persistedHash []byte
   164  }
   165  
   166  // taskRunnerState is used to snapshot the state of the task runner
   167  type taskRunnerState struct {
   168  	Version            string
   169  	HandleID           string
   170  	ArtifactDownloaded bool
   171  	TaskDirBuilt       bool
   172  	PayloadRendered    bool
   173  	CreatedResources   *driver.CreatedResources
   174  	DriverNetwork      *cstructs.DriverNetwork
   175  }
   176  
   177  func (s *taskRunnerState) Hash() []byte {
   178  	h := md5.New()
   179  
   180  	io.WriteString(h, s.Version)
   181  	io.WriteString(h, s.HandleID)
   182  	io.WriteString(h, fmt.Sprintf("%v", s.ArtifactDownloaded))
   183  	io.WriteString(h, fmt.Sprintf("%v", s.TaskDirBuilt))
   184  	io.WriteString(h, fmt.Sprintf("%v", s.PayloadRendered))
   185  	h.Write(s.CreatedResources.Hash())
   186  	h.Write(s.DriverNetwork.Hash())
   187  
   188  	return h.Sum(nil)
   189  }
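
// exampleSnapshotHashSkip is an illustrative sketch (not part of the original
// source) of the change-detection pattern SaveState builds on Hash: re-persist
// a snapshot only when its hash differs from the last one written.
func exampleSnapshotHashSkip(prev []byte, snap *taskRunnerState) (next []byte, write bool) {
	h := snap.Hash()
	if bytes.Equal(h, prev) {
		return prev, false // unchanged; skip the disk write
	}
	return h, true // changed; caller persists snap and records h
}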
   190  
   191  // TaskStateUpdater is used to signal that a task's state has changed.
   192  type TaskStateUpdater func(taskName, state string, event *structs.TaskEvent)
   193  
   194  // SignalEvent is a tuple of the signal and the event generating it
   195  type SignalEvent struct {
   196  	// s is the signal to be sent
   197  	s os.Signal
   198  
   199  	// e is the task event generating the signal
   200  	e *structs.TaskEvent
   201  
   202  	// result should be used to send back the result of the signal
   203  	result chan<- error
   204  }
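
// exampleSendSignal is an illustrative sketch (not part of the original
// source) of the request/response contract behind signalCh: the sender
// attaches a result channel to the SignalEvent and blocks until the run
// loop replies (or has already exited).
func exampleSendSignal(r *TaskRunner, sig os.Signal, ev *structs.TaskEvent) error {
	resCh := make(chan error, 1)
	select {
	case r.signalCh <- SignalEvent{s: sig, e: ev, result: resCh}:
		return <-resCh
	case <-r.waitCh:
		return fmt.Errorf("run loop exited before the signal was delivered")
	}
}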
   205  
   206  // NewTaskRunner is used to create a new task context
   207  func NewTaskRunner(logger *log.Logger, config *config.Config,
   208  	stateDB *bolt.DB, updater TaskStateUpdater, taskDir *allocdir.TaskDir,
   209  	alloc *structs.Allocation, task *structs.Task,
   210  	vaultClient vaultclient.VaultClient, consulClient ConsulServiceAPI) *TaskRunner {
   211  
   212  	// Merge in the task resources
   213  	task.Resources = alloc.TaskResources[task.Name]
   214  
   215  	// Build the restart tracker.
   216  	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
   217  	if tg == nil {
   218  		logger.Printf("[ERR] client: alloc '%s' references missing task group '%s'", alloc.ID, alloc.TaskGroup)
   219  		return nil
   220  	}
   221  	restartTracker := newRestartTracker(tg.RestartPolicy, alloc.Job.Type)
   222  
   223  	// Initialize the environment builder
   224  	envBuilder := env.NewBuilder(config.Node, alloc, task, config.Region)
   225  
   226  	tc := &TaskRunner{
   227  		config:           config,
   228  		stateDB:          stateDB,
   229  		updater:          updater,
   230  		logger:           logger,
   231  		restartTracker:   restartTracker,
   232  		alloc:            alloc,
   233  		task:             task,
   234  		taskDir:          taskDir,
   235  		envBuilder:       envBuilder,
   236  		createdResources: driver.NewCreatedResources(),
   237  		consul:           consulClient,
   238  		vaultClient:      vaultClient,
   239  		vaultFuture:      NewTokenFuture().Set(""),
   240  		updateCh:         make(chan *structs.Allocation, 64),
   241  		destroyCh:        make(chan struct{}),
   242  		waitCh:           make(chan struct{}),
   243  		startCh:          make(chan struct{}, 1),
   244  		unblockCh:        make(chan struct{}),
   245  		restartCh:        make(chan *structs.TaskEvent),
   246  		signalCh:         make(chan SignalEvent),
   247  	}
   248  
   249  	return tc
   250  }
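
// exampleRunTask is an illustrative sketch (not part of the original source)
// of the lifecycle the AllocRunner drives: construct the runner, mark the
// task received, start the run loop in its own goroutine, and wait for it to
// exit. All arguments are assumed to be prepared by the caller.
func exampleRunTask(logger *log.Logger, cfg *config.Config, db *bolt.DB,
	updater TaskStateUpdater, taskDir *allocdir.TaskDir, alloc *structs.Allocation,
	task *structs.Task, vc vaultclient.VaultClient, consul ConsulServiceAPI) {

	tr := NewTaskRunner(logger, cfg, db, updater, taskDir, alloc, task, vc, consul)
	if tr == nil {
		return // the alloc referenced a missing task group
	}
	tr.MarkReceived()
	go tr.Run()
	<-tr.WaitCh() // closed once the run loop has exited
}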
   251  
   252  // MarkReceived marks the task as received.
   253  func (r *TaskRunner) MarkReceived() {
   254  	r.updater(r.task.Name, structs.TaskStatePending, structs.NewTaskEvent(structs.TaskReceived))
   255  }
   256  
   257  // WaitCh returns a channel to wait for termination
   258  func (r *TaskRunner) WaitCh() <-chan struct{} {
   259  	return r.waitCh
   260  }
   261  
   262  // getHandle returns the task's handle or nil
   263  func (r *TaskRunner) getHandle() driver.DriverHandle {
   264  	r.handleLock.Lock()
   265  	h := r.handle
   266  	r.handleLock.Unlock()
   267  	return h
   268  }
   269  
   270  // pre060StateFilePath returns the path to our state file that would have been
   271  // written pre v0.6.0
   272  // COMPAT: Remove in 0.7.0
   273  func (r *TaskRunner) pre060StateFilePath() string {
   274  	// Get the MD5 of the task name
   275  	hashVal := md5.Sum([]byte(r.task.Name))
   276  	hashHex := hex.EncodeToString(hashVal[:])
   277  	dirName := fmt.Sprintf("task-%s", hashHex)
   278  
   279  	// Generate the path
   280  	return filepath.Join(r.config.StateDir, "alloc", r.alloc.ID, dirName, "state.json")
   281  }
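
// Illustrative example (not part of the original source): for a task named
// "redis" the pre-0.6.0 layout resolves to
//
//	<state-dir>/alloc/<alloc-id>/task-<hex md5 of "redis">/state.json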
   282  
   283  // RestoreState is used to restore our state. If a non-empty string is returned
   284  // the task is restarted with the string as the reason. This is useful for
   285  // backwards incompatible upgrades that need to restart tasks with a new
   286  // executor.
   287  func (r *TaskRunner) RestoreState() (string, error) {
   288  	// COMPAT: Remove in 0.7.0
   289  	// 0.6.0 transitioned from individual state files to a single bolt-db.
   290  	// The upgrade path is to:
   291  	// Check if old state exists
   292  	//   If so, restore from that and delete old state
   293  	// Restore using state database
   294  
   295  	var snap taskRunnerState
   296  
   297  	// Check if the old snapshot is there
   298  	oldPath := r.pre060StateFilePath()
   299  	if err := pre060RestoreState(oldPath, &snap); err == nil {
   300  		// Delete the old state
   301  		os.RemoveAll(oldPath)
   302  	} else if !os.IsNotExist(err) {
   303  		// Something corrupt in the old state file
   304  		return "", err
   305  	} else {
   306  		// We are doing a normal restore
   307  		err := r.stateDB.View(func(tx *bolt.Tx) error {
   308  			bkt, err := getTaskBucket(tx, r.alloc.ID, r.task.Name)
   309  			if err != nil {
   310  				return fmt.Errorf("failed to get task bucket: %v", err)
   311  			}
   312  
   313  			if err := getObject(bkt, taskRunnerStateAllKey, &snap); err != nil {
   314  				return fmt.Errorf("failed to read task runner state: %v", err)
   315  			}
   316  			return nil
   317  		})
   318  		if err != nil {
   319  			return "", err
   320  		}
   321  
   322  	}
   323  
   324  	// Restore fields from the snapshot
   325  	r.artifactsDownloaded = snap.ArtifactDownloaded
   326  	r.taskDirBuilt = snap.TaskDirBuilt
   327  	r.payloadRendered = snap.PayloadRendered
   328  	r.setCreatedResources(snap.CreatedResources)
   329  	r.driverNet = snap.DriverNetwork
   330  
   331  	if r.task.Vault != nil {
   332  		// Read the token from the secret directory
   333  		tokenPath := filepath.Join(r.taskDir.SecretsDir, vaultTokenFile)
   334  		data, err := ioutil.ReadFile(tokenPath)
   335  		if err != nil {
   336  			if !os.IsNotExist(err) {
   337  				return "", fmt.Errorf("failed to read token for task %q in alloc %q: %v", r.task.Name, r.alloc.ID, err)
   338  			}
   339  
   340  			// Token file doesn't exist
   341  		} else {
   342  			// Store the recovered token
   343  			r.recoveredVaultToken = string(data)
   344  		}
   345  	}
   346  
   347  	// Restore the driver
   348  	restartReason := ""
   349  	if snap.HandleID != "" {
   350  		d, err := r.createDriver()
   351  		if err != nil {
   352  			return "", err
   353  		}
   354  
   355  		// Add the restored driver network to the environment
   356  		r.envBuilder.SetDriverNetwork(r.driverNet)
   357  
   358  		// Open a connection to the driver handle
   359  		ctx := driver.NewExecContext(r.taskDir, r.envBuilder.Build())
   360  		handle, err := d.Open(ctx, snap.HandleID)
   361  
   362  		// In the case it fails, we relaunch the task in the Run() method.
   363  		if err != nil {
   364  			r.logger.Printf("[ERR] client: failed to open handle to task %q for alloc %q: %v",
   365  				r.task.Name, r.alloc.ID, err)
   366  			return "", nil
   367  		}
   368  
   369  		if pre06ScriptCheck(snap.Version, r.task.Driver, r.task.Services) {
   370  			restartReason = pre06ScriptCheckReason
   371  		}
   372  
   373  		if err := r.registerServices(d, handle, r.driverNet); err != nil {
   374  			// Don't hard fail here as there's a chance this task
   375  			// registered with Consul properly when it initially
   376  			// started.
   377  			r.logger.Printf("[WARN] client: failed to register services and checks with consul for task %q in alloc %q: %v",
   378  				r.task.Name, r.alloc.ID, err)
   379  		}
   380  
   381  		r.handleLock.Lock()
   382  		r.handle = handle
   383  		r.handleLock.Unlock()
   384  
   385  		r.runningLock.Lock()
   386  		r.running = true
   387  		r.runningLock.Unlock()
   388  	}
   389  	return restartReason, nil
   390  }
   391  
   392  // ver06 is used for checking for pre-0.6 script checks
   393  var ver06 = version.Must(version.NewVersion("0.6.0dev"))
   394  
   395  // pre06ScriptCheckReason is the restart reason given when a pre-0.6 script
   396  // check is found on an exec/java task.
   397  const pre06ScriptCheckReason = "upgrading pre-0.6 script checks"
   398  
   399  // pre06ScriptCheck returns true if version is prior to 0.6.0dev, has a script
   400  // check, and uses exec or java drivers.
   401  func pre06ScriptCheck(ver, driver string, services []*structs.Service) bool {
   402  	if driver != "exec" && driver != "java" && driver != "mock_driver" {
   403  		// Only exec and java are affected (mock_driver is included for tests)
   404  		return false
   405  	}
   406  	v, err := version.NewVersion(ver)
   407  	if err != nil {
   408  		// Treat it as old
   409  		return true
   410  	}
   411  	if !v.LessThan(ver06) {
   412  		// >= 0.6.0dev
   413  		return false
   414  	}
   415  	for _, service := range services {
   416  		for _, check := range service.Checks {
   417  			if check.Type == "script" {
   418  				return true
   419  			}
   420  		}
   421  	}
   422  	return false
   423  }
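
// Illustrative examples (not part of the original source) of how
// pre06ScriptCheck is applied: an exec task with a script check restored from
// a pre-0.6.0 snapshot returns true (forcing a restart so the new executor
// can run the check), while the same task on another driver such as docker,
// or on version "0.6.0" or later, returns false.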
   424  
   425  // SaveState is used to snapshot our state
   426  func (r *TaskRunner) SaveState() error {
   427  	r.destroyLock.Lock()
   428  	defer r.destroyLock.Unlock()
   429  	if r.destroy {
   430  		// Don't save state if already destroyed
   431  		return nil
   432  	}
   433  
   434  	r.persistLock.Lock()
   435  	defer r.persistLock.Unlock()
   436  	snap := taskRunnerState{
   437  		Version:            r.config.Version,
   438  		ArtifactDownloaded: r.artifactsDownloaded,
   439  		TaskDirBuilt:       r.taskDirBuilt,
   440  		PayloadRendered:    r.payloadRendered,
   441  		CreatedResources:   r.getCreatedResources(),
   442  	}
   443  
   444  	r.handleLock.Lock()
   445  	if r.handle != nil {
   446  		snap.HandleID = r.handle.ID()
   447  	}
   448  	r.handleLock.Unlock()
   449  
   450  	r.driverNetLock.Lock()
   451  	snap.DriverNetwork = r.driverNet.Copy()
   452  	r.driverNetLock.Unlock()
   453  
   454  	// If nothing has changed avoid the write
   455  	h := snap.Hash()
   456  	if bytes.Equal(h, r.persistedHash) {
   457  		return nil
   458  	}
   459  
   460  	// Serialize the object
   461  	var buf bytes.Buffer
   462  	if err := codec.NewEncoder(&buf, structs.MsgpackHandle).Encode(&snap); err != nil {
   463  		return fmt.Errorf("failed to serialize snapshot: %v", err)
   464  	}
   465  
   466  	// Start the transaction.
   467  	return r.stateDB.Batch(func(tx *bolt.Tx) error {
   468  		// Grab the task bucket
   469  		taskBkt, err := getTaskBucket(tx, r.alloc.ID, r.task.Name)
   470  		if err != nil {
   471  			return fmt.Errorf("failed to retrieve allocation bucket: %v", err)
   472  		}
   473  
   474  		if err := putData(taskBkt, taskRunnerStateAllKey, buf.Bytes()); err != nil {
   475  			return fmt.Errorf("failed to write task_runner state: %v", err)
   476  		}
   477  
   478  		// Store the hash that was persisted
   479  		tx.OnCommit(func() {
   480  			r.persistedHash = h
   481  		})
   482  
   483  		return nil
   484  	})
   485  }
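
// Note (illustrative, not part of the original source): persistedHash is only
// assigned inside tx.OnCommit, so a failed Batch leaves the previous hash in
// place and the next SaveState retries the write instead of wrongly skipping
// it as a duplicate.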
   486  
   487  // DestroyState is used to cleanup after ourselves
   488  func (r *TaskRunner) DestroyState() error {
   489  	r.persistLock.Lock()
   490  	defer r.persistLock.Unlock()
   491  
   492  	return r.stateDB.Update(func(tx *bolt.Tx) error {
   493  		if err := deleteTaskBucket(tx, r.alloc.ID, r.task.Name); err != nil {
   494  			return fmt.Errorf("failed to delete task bucket: %v", err)
   495  		}
   496  		return nil
   497  	})
   498  }
   499  
   500  // setState is used to update the state of the task runner
   501  func (r *TaskRunner) setState(state string, event *structs.TaskEvent) {
   502  	// Persist our state to disk.
   503  	if err := r.SaveState(); err != nil {
   504  		r.logger.Printf("[ERR] client: failed to save state of Task Runner for task %q: %v", r.task.Name, err)
   505  	}
   506  
   507  	// Indicate the task has been updated.
   508  	r.updater(r.task.Name, state, event)
   509  }
   510  
   511  // createDriver makes a driver for the task
   512  func (r *TaskRunner) createDriver() (driver.Driver, error) {
   513  	// Create a task-specific event emitter callback to expose minimal
   514  	// state to drivers
   515  	eventEmitter := func(m string, args ...interface{}) {
   516  		msg := fmt.Sprintf(m, args...)
   517  		r.logger.Printf("[DEBUG] client: driver event for alloc %q: %s", r.alloc.ID, msg)
   518  		r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskDriverMessage).SetDriverMessage(msg))
   519  	}
   520  
   521  	driverCtx := driver.NewDriverContext(r.task.Name, r.alloc.ID, r.config, r.config.Node, r.logger, eventEmitter)
   522  	d, err := driver.NewDriver(r.task.Driver, driverCtx)
   523  	if err != nil {
   524  		return nil, fmt.Errorf("failed to create driver '%s' for alloc %s: %v",
   525  			r.task.Driver, r.alloc.ID, err)
   526  	}
   527  
   528  	return d, err
   529  }
   530  
   531  // Run is a long running routine used to manage the task
   532  func (r *TaskRunner) Run() {
   533  	defer close(r.waitCh)
   534  	r.logger.Printf("[DEBUG] client: starting task context for '%s' (alloc '%s')",
   535  		r.task.Name, r.alloc.ID)
   536  
   537  	if err := r.validateTask(); err != nil {
   538  		r.setState(
   539  			structs.TaskStateDead,
   540  			structs.NewTaskEvent(structs.TaskFailedValidation).SetValidationError(err).SetFailsTask())
   541  		return
   542  	}
   543  
   544  	// Create a temporary driver so that we can determine the FSIsolation
   545  	// required. run->startTask will create a new driver after environment
   546  	// has been setup (env vars, templates, artifacts, secrets, etc).
   547  	tmpDrv, err := r.createDriver()
   548  	if err != nil {
   549  		e := fmt.Errorf("failed to create driver of task %q for alloc %q: %v", r.task.Name, r.alloc.ID, err)
   550  		r.setState(
   551  			structs.TaskStateDead,
   552  			structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(e).SetFailsTask())
   553  		return
   554  	}
   555  
   556  	// Build base task directory structure regardless of FS isolation abilities.
   557  	// This needs to happen before we start the Vault manager and call prestart
   558  	// as both those can write to the task directories
   559  	if err := r.buildTaskDir(tmpDrv.FSIsolation()); err != nil {
   560  		e := fmt.Errorf("failed to build task directory for %q: %v", r.task.Name, err)
   561  		r.setState(
   562  			structs.TaskStateDead,
   563  			structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(e).SetFailsTask())
   564  		return
   565  	}
   566  
   567  	// If there is no Vault policy leave the static future created in
   568  	// NewTaskRunner
   569  	if r.task.Vault != nil {
   570  		// Start the go-routine to get a Vault token
   571  		r.vaultFuture.Clear()
   572  		go r.vaultManager(r.recoveredVaultToken)
   573  	}
   574  
   575  	// Start the run loop
   576  	r.run()
   577  
   578  	// Do any cleanup necessary
   579  	r.postrun()
   580  
   581  	return
   582  }
   583  
   584  // validateTask validates the fields of the task and returns an error if the
   585  // task is invalid.
   586  func (r *TaskRunner) validateTask() error {
   587  	var mErr multierror.Error
   588  
   589  	// Validate the user.
   590  	unallowedUsers := r.config.ReadStringListToMapDefault("user.blacklist", config.DefaultUserBlacklist)
   591  	checkDrivers := r.config.ReadStringListToMapDefault("user.checked_drivers", config.DefaultUserCheckedDrivers)
   592  	if _, driverMatch := checkDrivers[r.task.Driver]; driverMatch {
   593  		if _, unallowed := unallowedUsers[r.task.User]; unallowed {
   594  			mErr.Errors = append(mErr.Errors, fmt.Errorf("running as user %q is disallowed", r.task.User))
   595  		}
   596  	}
   597  
   598  	// Validate the artifacts
   599  	for i, artifact := range r.task.Artifacts {
   600  		// Verify the artifact doesn't escape the task directory.
   601  		if err := artifact.Validate(); err != nil {
   602  			// If this error occurs there is potentially a server bug or
   603  			// malicious server spoofing.
   604  			r.logger.Printf("[ERR] client: allocation %q, task %v, artifact %#v (%v) fails validation: %v",
   605  				r.alloc.ID, r.task.Name, artifact, i, err)
   606  			mErr.Errors = append(mErr.Errors, fmt.Errorf("artifact (%d) failed validation: %v", i, err))
   607  		}
   608  	}
   609  
   610  	// Validate the Service names
   611  	taskEnv := r.envBuilder.Build()
   612  	for i, service := range r.task.Services {
   613  		name := taskEnv.ReplaceEnv(service.Name)
   614  		if err := service.ValidateName(name); err != nil {
   615  			mErr.Errors = append(mErr.Errors, fmt.Errorf("service (%d) failed validation: %v", i, err))
   616  		}
   617  	}
   618  
   619  	if len(mErr.Errors) == 1 {
   620  		return mErr.Errors[0]
   621  	}
   622  	return mErr.ErrorOrNil()
   623  }
   624  
   625  // tokenFuture stores the Vault token and allows consumers to block until a
   626  // valid token exists
   627  type tokenFuture struct {
   628  	waiting []chan struct{}
   629  	token   string
   630  	set     bool
   631  	m       sync.Mutex
   632  }
   633  
   634  // NewTokenFuture returns a new token future without any token set
   635  func NewTokenFuture() *tokenFuture {
   636  	return &tokenFuture{}
   637  }
   638  
   639  // Wait returns a channel that can be waited on. When this channel unblocks, a
   640  // valid token will be available via the Get method
   641  func (f *tokenFuture) Wait() <-chan struct{} {
   642  	f.m.Lock()
   643  	defer f.m.Unlock()
   644  
   645  	c := make(chan struct{})
   646  	if f.set {
   647  		close(c)
   648  		return c
   649  	}
   650  
   651  	f.waiting = append(f.waiting, c)
   652  	return c
   653  }
   654  
   655  // Set sets the token value and unblocks any caller of Wait
   656  func (f *tokenFuture) Set(token string) *tokenFuture {
   657  	f.m.Lock()
   658  	defer f.m.Unlock()
   659  
   660  	f.set = true
   661  	f.token = token
   662  	for _, w := range f.waiting {
   663  		close(w)
   664  	}
   665  	f.waiting = nil
   666  	return f
   667  }
   668  
   669  // Clear clears the set vault token.
   670  func (f *tokenFuture) Clear() *tokenFuture {
   671  	f.m.Lock()
   672  	defer f.m.Unlock()
   673  
   674  	f.token = ""
   675  	f.set = false
   676  	return f
   677  }
   678  
   679  // Get returns the set Vault token
   680  func (f *tokenFuture) Get() string {
   681  	f.m.Lock()
   682  	defer f.m.Unlock()
   683  	return f.token
   684  }
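
// exampleTokenFutureUsage is an illustrative sketch (not part of the original
// source) of the tokenFuture contract: consumers block on Wait until a
// producer publishes a token via Set, after which Get returns it.
func exampleTokenFutureUsage() string {
	f := NewTokenFuture()
	go func() {
		// Producer side: derive a token, then publish it. The token value
		// here is hypothetical.
		f.Set("example-token")
	}()
	<-f.Wait() // unblocks once Set has closed the waiting channels
	return f.Get()
}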
   685  
   686  // vaultManager should be called in a go-routine and manages the derivation,
   687  // renewal and handling of errors with the Vault token. The optional parameter
   688  // allows setting the initial Vault token. This is useful when the Vault token
   689  // is recovered off disk.
   690  func (r *TaskRunner) vaultManager(token string) {
   691  	// Helper for stopping token renewal
   692  	stopRenewal := func() {
   693  		if err := r.vaultClient.StopRenewToken(r.vaultFuture.Get()); err != nil {
   694  			r.logger.Printf("[WARN] client: failed to stop token renewal for task %v in alloc %q: %v", r.task.Name, r.alloc.ID, err)
   695  		}
   696  	}
   697  
   698  	// updatedToken lets us store state between loops. If true, a new token
   699  	// has been retrieved and we need to apply the Vault change mode
   700  	var updatedToken bool
   701  
   702  OUTER:
   703  	for {
   704  		// Check if we should exit
   705  		select {
   706  		case <-r.waitCh:
   707  			stopRenewal()
   708  			return
   709  		default:
   710  		}
   711  
   712  		// Clear the token
   713  		r.vaultFuture.Clear()
   714  
   715  		// Check if there already is a token, which can be the case when
   716  		// restoring the TaskRunner
   717  		if token == "" {
   718  			// Get a token
   719  			var exit bool
   720  			token, exit = r.deriveVaultToken()
   721  			if exit {
   722  				// Exit the manager
   723  				return
   724  			}
   725  
   726  			// Write the token to disk
   727  			if err := r.writeToken(token); err != nil {
   728  				e := fmt.Errorf("failed to write Vault token to disk")
   729  				r.logger.Printf("[ERR] client: %v for task %v on alloc %q: %v", e, r.task.Name, r.alloc.ID, err)
   730  				r.Kill("vault", e.Error(), true)
   731  				return
   732  			}
   733  		}
   734  
   735  		// Start the renewal process
   736  		renewCh, err := r.vaultClient.RenewToken(token, 30)
   737  
   738  		// An error returned means the token is not being renewed
   739  		if err != nil {
   740  			r.logger.Printf("[ERR] client: failed to start renewal of Vault token for task %v on alloc %q: %v", r.task.Name, r.alloc.ID, err)
   741  			token = ""
   742  			goto OUTER
   743  		}
   744  
   745  		// The Vault token is valid now, so set it
   746  		r.vaultFuture.Set(token)
   747  
   748  		if updatedToken {
   749  			switch r.task.Vault.ChangeMode {
   750  			case structs.VaultChangeModeSignal:
   751  				s, err := signals.Parse(r.task.Vault.ChangeSignal)
   752  				if err != nil {
   753  					e := fmt.Errorf("failed to parse signal: %v", err)
   754  					r.logger.Printf("[ERR] client: %v", e)
   755  					r.Kill("vault", e.Error(), true)
   756  					return
   757  				}
   758  
   759  				if err := r.Signal("vault", "new Vault token acquired", s); err != nil {
   760  					r.logger.Printf("[ERR] client: failed to send signal to task %v for alloc %q: %v", r.task.Name, r.alloc.ID, err)
   761  					r.Kill("vault", fmt.Sprintf("failed to send signal to task: %v", err), true)
   762  					return
   763  				}
   764  			case structs.VaultChangeModeRestart:
   765  				r.Restart("vault", "new Vault token acquired")
   766  			case structs.VaultChangeModeNoop:
   767  				fallthrough
   768  			default:
   769  				r.logger.Printf("[ERR] client: Invalid Vault change mode: %q", r.task.Vault.ChangeMode)
   770  			}
   771  
   772  			// We have handled it
   773  			updatedToken = false
   774  
   775  			// Call the handler
   776  			r.updatedTokenHandler()
   777  		}
   778  
   779  		// Start watching for renewal errors
   780  		select {
   781  		case err := <-renewCh:
   782  			// Clear the token
   783  			token = ""
   784  			r.logger.Printf("[ERR] client: failed to renew Vault token for task %v on alloc %q: %v", r.task.Name, r.alloc.ID, err)
   785  			stopRenewal()
   786  
   787  			// Check if we have to do anything
   788  			if r.task.Vault.ChangeMode != structs.VaultChangeModeNoop {
   789  				updatedToken = true
   790  			}
   791  		case <-r.waitCh:
   792  			stopRenewal()
   793  			return
   794  		}
   795  	}
   796  }
   797  
   798  // deriveVaultToken derives the Vault token using exponential backoffs. It
   799  // returns the Vault token and whether the manager should exit.
   800  func (r *TaskRunner) deriveVaultToken() (token string, exit bool) {
   801  	attempts := 0
   802  	for {
   803  		tokens, err := r.vaultClient.DeriveToken(r.alloc, []string{r.task.Name})
   804  		if err == nil {
   805  			return tokens[r.task.Name], false
   806  		}
   807  
   808  		// Check if we can't recover from the error
   809  		if !structs.IsRecoverable(err) {
   810  			r.logger.Printf("[ERR] client: failed to derive Vault token for task %v on alloc %q: %v",
   811  				r.task.Name, r.alloc.ID, err)
   812  			r.Kill("vault", fmt.Sprintf("failed to derive token: %v", err), true)
   813  			return "", true
   814  		}
   815  
   816  		// Handle the retry case
   817  		backoff := (1 << (2 * uint64(attempts))) * vaultBackoffBaseline
   818  		if backoff > vaultBackoffLimit {
   819  			backoff = vaultBackoffLimit
   820  		}
   821  		r.logger.Printf("[ERR] client: failed to derive Vault token for task %v on alloc %q: %v; retrying in %v",
   822  			r.task.Name, r.alloc.ID, err, backoff)
   823  
   824  		attempts++
   825  
   826  		// Wait until retrying
   827  		select {
   828  		case <-r.waitCh:
   829  			return "", true
   830  		case <-time.After(backoff):
   831  		}
   832  	}
   833  }
   834  
   835  // writeToken writes the given token to disk
   836  func (r *TaskRunner) writeToken(token string) error {
   837  	tokenPath := filepath.Join(r.taskDir.SecretsDir, vaultTokenFile)
   838  	if err := ioutil.WriteFile(tokenPath, []byte(token), 0777); err != nil {
   839  		return fmt.Errorf("failed to save Vault tokens to secret dir for task %q in alloc %q: %v", r.task.Name, r.alloc.ID, err)
   840  	}
   841  
   842  	return nil
   843  }
   844  
   845  // updatedTokenHandler is called when a new Vault token is retrieved. Things
   846  // that rely on the token should be updated here.
   847  func (r *TaskRunner) updatedTokenHandler() {
   848  
   849  	// Update the tasks environment
   850  	r.envBuilder.SetVaultToken(r.vaultFuture.Get(), r.task.Vault.Env)
   851  
   852  	if r.templateManager != nil {
   853  		r.templateManager.Stop()
   854  
   855  		// Create a new templateManager
   856  		var err error
   857  		r.templateManager, err = NewTaskTemplateManager(r, r.task.Templates,
   858  			r.config, r.vaultFuture.Get(), r.taskDir.Dir, r.envBuilder)
   859  		if err != nil {
   860  			err := fmt.Errorf("failed to build task's template manager: %v", err)
   861  			r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
   862  			r.logger.Printf("[ERR] client: alloc %q, task %q %v", r.alloc.ID, r.task.Name, err)
   863  			r.Kill("vault", err.Error(), true)
   864  			return
   865  		}
   866  	}
   867  }
   868  
   869  // prestart handles life-cycle tasks that occur before the task has started.
   870  // Since it's run asynchronously with the main Run() loop the alloc & task are
   871  // passed in to avoid racing with updates.
   872  func (r *TaskRunner) prestart(alloc *structs.Allocation, task *structs.Task, resultCh chan bool) {
   873  	if task.Vault != nil {
   874  		// Wait for the token
   875  		r.logger.Printf("[DEBUG] client: waiting for Vault token for task %v in alloc %q", task.Name, alloc.ID)
   876  		tokenCh := r.vaultFuture.Wait()
   877  		select {
   878  		case <-tokenCh:
   879  		case <-r.waitCh:
   880  			resultCh <- false
   881  			return
   882  		}
   883  		r.logger.Printf("[DEBUG] client: retrieved Vault token for task %v in alloc %q", task.Name, alloc.ID)
   884  		r.envBuilder.SetVaultToken(r.vaultFuture.Get(), task.Vault.Env)
   885  	}
   886  
   887  	// If the job is a dispatch job and there is a payload write it to disk
   888  	requirePayload := len(alloc.Job.Payload) != 0 &&
   889  		(task.DispatchPayload != nil && task.DispatchPayload.File != "")
   890  	if !r.payloadRendered && requirePayload {
   891  		renderTo := filepath.Join(r.taskDir.LocalDir, task.DispatchPayload.File)
   892  		decoded, err := snappy.Decode(nil, alloc.Job.Payload)
   893  		if err != nil {
   894  			r.setState(
   895  				structs.TaskStateDead,
   896  				structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
   897  			resultCh <- false
   898  			return
   899  		}
   900  
   901  		if err := os.MkdirAll(filepath.Dir(renderTo), 0777); err != nil {
   902  			r.setState(
   903  				structs.TaskStateDead,
   904  				structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
   905  			resultCh <- false
   906  			return
   907  		}
   908  
   909  		if err := ioutil.WriteFile(renderTo, decoded, 0777); err != nil {
   910  			r.setState(
   911  				structs.TaskStateDead,
   912  				structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
   913  			resultCh <- false
   914  			return
   915  		}
   916  
   917  		r.payloadRendered = true
   918  	}
   919  
   920  	for {
   921  		r.persistLock.Lock()
   922  		downloaded := r.artifactsDownloaded
   923  		r.persistLock.Unlock()
   924  
   925  		// Download the task's artifacts
   926  		if !downloaded && len(task.Artifacts) > 0 {
   927  			r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskDownloadingArtifacts))
   928  			taskEnv := r.envBuilder.Build()
   929  			for _, artifact := range task.Artifacts {
   930  				if err := getter.GetArtifact(taskEnv, artifact, r.taskDir.Dir); err != nil {
   931  					wrapped := fmt.Errorf("failed to download artifact %q: %v", artifact.GetterSource, err)
   932  					r.logger.Printf("[DEBUG] client: %v", wrapped)
   933  					r.setState(structs.TaskStatePending,
   934  						structs.NewTaskEvent(structs.TaskArtifactDownloadFailed).SetDownloadError(wrapped))
   935  					r.restartTracker.SetStartError(structs.WrapRecoverable(wrapped.Error(), err))
   936  					goto RESTART
   937  				}
   938  			}
   939  
   940  			r.persistLock.Lock()
   941  			r.artifactsDownloaded = true
   942  			r.persistLock.Unlock()
   943  		}
   944  
   945  		// We don't have to wait for any template
   946  		if len(task.Templates) == 0 {
   947  			// Send the start signal
   948  			select {
   949  			case r.startCh <- struct{}{}:
   950  			default:
   951  			}
   952  
   953  			resultCh <- true
   954  			return
   955  		}
   956  
   957  		// Build the template manager
   958  		if r.templateManager == nil {
   959  			var err error
   960  			r.templateManager, err = NewTaskTemplateManager(r, task.Templates,
   961  				r.config, r.vaultFuture.Get(), r.taskDir.Dir, r.envBuilder)
   962  			if err != nil {
   963  				err := fmt.Errorf("failed to build task's template manager: %v", err)
   964  				r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
   965  				r.logger.Printf("[ERR] client: alloc %q, task %q %v", alloc.ID, task.Name, err)
   966  				resultCh <- false
   967  				return
   968  			}
   969  		}
   970  
   971  		// Block for consul-template
   972  		// TODO Hooks should register themselves as blocking and then we can
   973  		// periodically enumerate what we are still blocked on
   974  		select {
   975  		case <-r.unblockCh:
   976  			// Send the start signal
   977  			select {
   978  			case r.startCh <- struct{}{}:
   979  			default:
   980  			}
   981  
   982  			resultCh <- true
   983  			return
   984  		case <-r.waitCh:
   985  			// The run loop has exited so exit too
   986  			resultCh <- false
   987  			return
   988  		}
   989  
   990  	RESTART:
   991  		restart := r.shouldRestart()
   992  		if !restart {
   993  			resultCh <- false
   994  			return
   995  		}
   996  	}
   997  }
   998  
   999  // postrun is used to do any cleanup that is necessary after exiting the runloop
  1000  func (r *TaskRunner) postrun() {
  1001  	// Stop the template manager
  1002  	if r.templateManager != nil {
  1003  		r.templateManager.Stop()
  1004  	}
  1005  }
  1006  
  1007  // run is the main run loop that handles starting the application, destroying
  1008  // it, restarts and signals.
  1009  func (r *TaskRunner) run() {
  1010  	// Predeclare things so we can jump to the RESTART
  1011  	var stopCollection chan struct{}
  1012  	var handleWaitCh chan *dstructs.WaitResult
  1013  
  1014  	// If we already have a handle, populate the stopCollection and handleWaitCh
  1015  	// to maintain the invariant that they exist.
  1016  	handleEmpty := r.getHandle() == nil
  1017  
  1018  	if !handleEmpty {
  1019  		stopCollection = make(chan struct{})
  1020  		go r.collectResourceUsageStats(stopCollection)
  1021  		handleWaitCh = r.handle.WaitCh()
  1022  	}
  1023  
  1024  	for {
  1025  		// Do the prestart activities
  1026  		prestartResultCh := make(chan bool, 1)
  1027  		go r.prestart(r.alloc, r.task, prestartResultCh)
  1028  
  1029  	WAIT:
  1030  		for {
  1031  			select {
  1032  			case success := <-prestartResultCh:
  1033  				if !success {
  1034  					r.cleanup()
  1035  					r.setState(structs.TaskStateDead, nil)
  1036  					return
  1037  				}
  1038  			case <-r.startCh:
  1039  				// Start the task if not yet started or it is being forced. This logic
  1040  				// is necessary because in the case of a restore the handle already
  1041  				// exists.
  1042  				handleEmpty := r.getHandle() == nil
  1043  				if handleEmpty {
  1044  					startErr := r.startTask()
  1045  					r.restartTracker.SetStartError(startErr)
  1046  					if startErr != nil {
  1047  						r.setState("", structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(startErr))
  1048  						goto RESTART
  1049  					}
  1050  
  1051  					// Mark the task as started
  1052  					r.setState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted))
  1053  					r.runningLock.Lock()
  1054  					r.running = true
  1055  					r.runningLock.Unlock()
  1056  
  1057  					if stopCollection == nil {
  1058  						stopCollection = make(chan struct{})
  1059  						go r.collectResourceUsageStats(stopCollection)
  1060  					}
  1061  
  1062  					handleWaitCh = r.handle.WaitCh()
  1063  				}
  1064  
  1065  			case waitRes := <-handleWaitCh:
  1066  				if waitRes == nil {
  1067  					panic("nil wait")
  1068  				}
  1069  
  1070  				r.runningLock.Lock()
  1071  				r.running = false
  1072  				r.runningLock.Unlock()
  1073  
  1074  				// Stop collection of the task's resource usage
  1075  				close(stopCollection)
  1076  
  1077  				// Log whether the task was successful or not.
  1078  				r.restartTracker.SetWaitResult(waitRes)
  1079  				r.setState("", r.waitErrorToEvent(waitRes))
  1080  				if !waitRes.Successful() {
  1081  					r.logger.Printf("[INFO] client: task %q for alloc %q failed: %v", r.task.Name, r.alloc.ID, waitRes)
  1082  				} else {
  1083  					r.logger.Printf("[INFO] client: task %q for alloc %q completed successfully", r.task.Name, r.alloc.ID)
  1084  				}
  1085  
  1086  				break WAIT
  1087  			case update := <-r.updateCh:
  1088  				if err := r.handleUpdate(update); err != nil {
  1089  					r.logger.Printf("[ERR] client: update to task %q failed: %v", r.task.Name, err)
  1090  				}
  1091  
  1092  			case se := <-r.signalCh:
  1093  				r.runningLock.Lock()
  1094  				running := r.running
  1095  				r.runningLock.Unlock()
  1096  				common := fmt.Sprintf("signal %v to task %v for alloc %q", se.s, r.task.Name, r.alloc.ID)
  1097  				if !running {
  1098  					// Send no error
  1099  					r.logger.Printf("[DEBUG] client: skipping %s", common)
  1100  					se.result <- nil
  1101  					continue
  1102  				}
  1103  
  1104  				r.logger.Printf("[DEBUG] client: sending %s", common)
  1105  				r.setState(structs.TaskStateRunning, se.e)
  1106  
  1107  				res := r.handle.Signal(se.s)
  1108  				se.result <- res
  1109  
  1110  			case event := <-r.restartCh:
  1111  				r.runningLock.Lock()
  1112  				running := r.running
  1113  				r.runningLock.Unlock()
  1114  				common := fmt.Sprintf("task %v for alloc %q", r.task.Name, r.alloc.ID)
  1115  				if !running {
  1116  					r.logger.Printf("[DEBUG] client: skipping restart of %v: task isn't running", common)
  1117  					continue
  1118  				}
  1119  
  1120  				r.logger.Printf("[DEBUG] client: restarting %s: %v", common, event.RestartReason)
  1121  				r.setState(structs.TaskStateRunning, event)
  1122  				r.killTask(nil)
  1123  
  1124  				close(stopCollection)
  1125  
  1126  				if handleWaitCh != nil {
  1127  					<-handleWaitCh
  1128  				}
  1129  
  1130  				// Since the restart isn't from a failure, restart immediately
  1131  				// and don't count against the restart policy
  1132  				r.restartTracker.SetRestartTriggered()
  1133  				break WAIT
  1134  
  1135  			case <-r.destroyCh:
  1136  				r.runningLock.Lock()
  1137  				running := r.running
  1138  				r.runningLock.Unlock()
  1139  				if !running {
  1140  					r.cleanup()
  1141  					r.setState(structs.TaskStateDead, r.destroyEvent)
  1142  					return
  1143  				}
  1144  
  1145  				// Remove from consul before killing the task so that traffic
  1146  				// can be rerouted
  1147  				interpTask := interpolateServices(r.envBuilder.Build(), r.task)
  1148  				r.consul.RemoveTask(r.alloc.ID, interpTask)
  1149  
  1150  				// Store the task event that provides context on the task
  1151  				// destroy. The Killed event is set from the alloc_runner and
  1152  				// doesn't add detail
  1153  				var killEvent *structs.TaskEvent
  1154  				if r.destroyEvent.Type != structs.TaskKilled {
  1155  					if r.destroyEvent.Type == structs.TaskKilling {
  1156  						killEvent = r.destroyEvent
  1157  					} else {
  1158  						r.setState(structs.TaskStateRunning, r.destroyEvent)
  1159  					}
  1160  				}
  1161  
  1162  				r.killTask(killEvent)
  1163  				close(stopCollection)
  1164  
  1165  				// Wait for handler to exit before calling cleanup
  1166  				<-handleWaitCh
  1167  				r.cleanup()
  1168  
  1169  				r.setState(structs.TaskStateDead, nil)
  1170  				return
  1171  			}
  1172  		}
  1173  
  1174  	RESTART:
  1175  		// shouldRestart will block if the task should restart after a delay.
  1176  		restart := r.shouldRestart()
  1177  		if !restart {
  1178  			r.cleanup()
  1179  			r.setState(structs.TaskStateDead, nil)
  1180  			return
  1181  		}
  1182  
  1183  		// Clear the handle so a new driver will be created.
  1184  		r.handleLock.Lock()
  1185  		r.handle = nil
  1186  		handleWaitCh = nil
  1187  		stopCollection = nil
  1188  		r.handleLock.Unlock()
  1189  	}
  1190  }
  1191  
  1192  // cleanup removes Consul entries and calls Driver.Cleanup when a task is
  1193  // stopping. Errors are logged.
  1194  func (r *TaskRunner) cleanup() {
  1195  	// Remove from Consul
  1196  	interpTask := interpolateServices(r.envBuilder.Build(), r.task)
  1197  	r.consul.RemoveTask(r.alloc.ID, interpTask)
  1198  
  1199  	drv, err := r.createDriver()
  1200  	if err != nil {
  1201  		r.logger.Printf("[ERR] client: error creating driver to cleanup resources: %v", err)
  1202  		return
  1203  	}
  1204  
  1205  	res := r.getCreatedResources()
  1206  
  1207  	ctx := driver.NewExecContext(r.taskDir, r.envBuilder.Build())
  1208  	attempts := 1
  1209  	var cleanupErr error
  1210  	for retry := true; retry; attempts++ {
  1211  		cleanupErr = drv.Cleanup(ctx, res)
  1212  		retry = structs.IsRecoverable(cleanupErr)
  1213  
  1214  		// Copy current createdResources state in case SaveState is
  1215  		// called between retries
  1216  		r.setCreatedResources(res)
  1217  
  1218  		// Retry 3 times with sleeps between
  1219  		if !retry || attempts > 3 {
  1220  			break
  1221  		}
  1222  		time.Sleep(time.Duration(attempts) * time.Second)
  1223  	}
  1224  
  1225  	if cleanupErr != nil {
  1226  		r.logger.Printf("[ERR] client: error cleaning up resources for task %q after %d attempts: %v", r.task.Name, attempts, cleanupErr)
  1227  	}
  1228  	return
  1229  }
  1230  
  1231  // shouldRestart returns if the task should restart. If the return value is
  1232  // true, the task's restart policy has already been considered and any wait time
  1233  // between restarts has been applied.
  1234  func (r *TaskRunner) shouldRestart() bool {
  1235  	state, when := r.restartTracker.GetState()
  1236  	reason := r.restartTracker.GetReason()
  1237  	switch state {
  1238  	case structs.TaskNotRestarting, structs.TaskTerminated:
  1239  		r.logger.Printf("[INFO] client: Not restarting task: %v for alloc: %v ", r.task.Name, r.alloc.ID)
  1240  		if state == structs.TaskNotRestarting {
  1241  			r.setState(structs.TaskStateDead,
  1242  				structs.NewTaskEvent(structs.TaskNotRestarting).
  1243  					SetRestartReason(reason).SetFailsTask())
  1244  		}
  1245  		return false
  1246  	case structs.TaskRestarting:
  1247  		r.logger.Printf("[INFO] client: Restarting task %q for alloc %q in %v", r.task.Name, r.alloc.ID, when)
  1248  		r.setState(structs.TaskStatePending,
  1249  			structs.NewTaskEvent(structs.TaskRestarting).
  1250  				SetRestartDelay(when).
  1251  				SetRestartReason(reason))
  1252  	default:
  1253  		r.logger.Printf("[ERR] client: restart tracker returned unknown state: %q", state)
  1254  		return false
  1255  	}
  1256  
  1257  	// Unregister from Consul while waiting to restart.
  1258  	interpTask := interpolateServices(r.envBuilder.Build(), r.task)
  1259  	r.consul.RemoveTask(r.alloc.ID, interpTask)
  1260  
  1261  	// Sleep but watch for destroy events.
  1262  	select {
  1263  	case <-time.After(when):
  1264  	case <-r.destroyCh:
  1265  	}
  1266  
  1267  	// Destroyed while we were waiting to restart, so abort.
  1268  	r.destroyLock.Lock()
  1269  	destroyed := r.destroy
  1270  	r.destroyLock.Unlock()
  1271  	if destroyed {
  1272  		r.logger.Printf("[DEBUG] client: Not restarting task: %v because it has been destroyed", r.task.Name)
  1273  		r.setState(structs.TaskStateDead, r.destroyEvent)
  1274  		return false
  1275  	}
  1276  
  1277  	return true
  1278  }
  1279  
  1280  // killTask kills the running task. A killing event can optionally be passed and
  1281  // this event is used to mark the task as being killed. It provides a means to
  1282  // store extra information.
  1283  func (r *TaskRunner) killTask(killingEvent *structs.TaskEvent) {
  1284  	r.runningLock.Lock()
  1285  	running := r.running
  1286  	r.runningLock.Unlock()
  1287  	if !running {
  1288  		return
  1289  	}
  1290  
  1291  	// Get the kill timeout
  1292  	timeout := driver.GetKillTimeout(r.task.KillTimeout, r.config.MaxKillTimeout)
  1293  
  1294  	// Build the event
  1295  	var event *structs.TaskEvent
  1296  	if killingEvent != nil {
  1297  		event = killingEvent
  1298  		event.Type = structs.TaskKilling
  1299  	} else {
  1300  		event = structs.NewTaskEvent(structs.TaskKilling)
  1301  	}
  1302  	event.SetKillTimeout(timeout)
  1303  
  1304  	// Mark that we received the kill event
  1305  	r.setState(structs.TaskStateRunning, event)
  1306  
  1307  	handle := r.getHandle()
  1308  
  1309  	// Kill the task using an exponential backoff in-case of failures.
  1310  	destroySuccess, err := r.handleDestroy(handle)
  1311  	if !destroySuccess {
  1312  		// We couldn't successfully destroy the resources created.
  1313  		r.logger.Printf("[ERR] client: failed to kill task %q. Resources may have been leaked: %v", r.task.Name, err)
  1314  	}
  1315  
  1316  	r.runningLock.Lock()
  1317  	r.running = false
  1318  	r.runningLock.Unlock()
  1319  
  1320  	// Store that the task has been destroyed and any associated error.
  1321  	r.setState("", structs.NewTaskEvent(structs.TaskKilled).SetKillError(err))
  1322  }
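
// Note (illustrative, not part of this excerpt): handleDestroy retries the
// handle's Kill with an exponential backoff starting at killBackoffBaseline
// and capped at killBackoffLimit, giving up after killFailureLimit attempts,
// at which point resources may be leaked as the constants above warn.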
  1323  
  1324  // startTask creates the driver, task dir, and starts the task.
  1325  func (r *TaskRunner) startTask() error {
  1326  	// Create a driver
  1327  	drv, err := r.createDriver()
  1328  	if err != nil {
  1329  		return fmt.Errorf("failed to create driver of task %q for alloc %q: %v",
  1330  			r.task.Name, r.alloc.ID, err)
  1331  	}
  1332  
  1333  	// Run prestart
  1334  	ctx := driver.NewExecContext(r.taskDir, r.envBuilder.Build())
  1335  	presp, err := drv.Prestart(ctx, r.task)
  1336  
  1337  	// Merge newly created resources into previously created resources
  1338  	if presp != nil {
  1339  		r.createdResourcesLock.Lock()
  1340  		r.createdResources.Merge(presp.CreatedResources)
  1341  		r.createdResourcesLock.Unlock()
  1342  
  1343  		// Set any network configuration returned by the driver
  1344  		r.envBuilder.SetDriverNetwork(presp.Network)
  1345  	}
  1346  
  1347  	if err != nil {
  1348  		wrapped := fmt.Sprintf("failed to initialize task %q for alloc %q: %v",
  1349  			r.task.Name, r.alloc.ID, err)
  1350  		r.logger.Printf("[WARN] client: error from prestart: %s", wrapped)
  1351  		return structs.WrapRecoverable(wrapped, err)
  1352  	}
  1353  
  1354  	// Create a new context for Start since the environment may have been updated.
  1355  	ctx = driver.NewExecContext(r.taskDir, r.envBuilder.Build())
  1356  
  1357  	// Start the job
  1358  	sresp, err := drv.Start(ctx, r.task)
  1359  	if err != nil {
  1360  		wrapped := fmt.Sprintf("failed to start task %q for alloc %q: %v",
  1361  			r.task.Name, r.alloc.ID, err)
  1362  		r.logger.Printf("[WARN] client: %s", wrapped)
  1363  		return structs.WrapRecoverable(wrapped, err)
  1364  
  1365  	}
  1366  
  1367  	// Update environment with the network defined by the driver's Start method.
  1368  	r.envBuilder.SetDriverNetwork(sresp.Network)
  1369  
  1370  	if err := r.registerServices(drv, sresp.Handle, sresp.Network); err != nil {
  1371  		// All IO is done asynchronously, so errors from registering
  1372  		// services are hard failures.
  1373  		r.logger.Printf("[ERR] client: failed to register services and checks for task %q alloc %q: %v", r.task.Name, r.alloc.ID, err)
  1374  
  1375  		// Kill the started task
  1376  		if destroyed, err := r.handleDestroy(sresp.Handle); !destroyed {
  1377  			r.logger.Printf("[ERR] client: failed to kill task %q alloc %q. Resources may be leaked: %v",
  1378  				r.task.Name, r.alloc.ID, err)
  1379  		}
  1380  		return structs.NewRecoverableError(err, false)
  1381  	}
  1382  
  1383  	r.handleLock.Lock()
  1384  	r.handle = sresp.Handle
  1385  	r.handleLock.Unlock()
  1386  
  1387  	// Need to persist the driver network between restarts
  1388  	r.driverNetLock.Lock()
  1389  	r.driverNet = sresp.Network
  1390  	r.driverNetLock.Unlock()
  1391  
  1392  	return nil
  1393  }
  1394  
  1395  // registerServices and checks with Consul.
  1396  func (r *TaskRunner) registerServices(d driver.Driver, h driver.DriverHandle, n *cstructs.DriverNetwork) error {
  1397  	var exec driver.ScriptExecutor
  1398  	if d.Abilities().Exec {
  1399  		// Set the script executor if the driver supports it
  1400  		exec = h
  1401  	}
  1402  	interpolatedTask := interpolateServices(r.envBuilder.Build(), r.task)
  1403  	return r.consul.RegisterTask(r.alloc.ID, interpolatedTask, exec, n)
  1404  }
  1405  
  1406  // interpolateServices interpolates a task's services and checks with values
  1407  // from the task's environment.
  1408  func interpolateServices(taskEnv *env.TaskEnv, task *structs.Task) *structs.Task {
  1409  	taskCopy := task.Copy()
  1410  	for _, service := range taskCopy.Services {
  1411  		for _, check := range service.Checks {
  1412  			check.Name = taskEnv.ReplaceEnv(check.Name)
  1413  			check.Type = taskEnv.ReplaceEnv(check.Type)
  1414  			check.Command = taskEnv.ReplaceEnv(check.Command)
  1415  			check.Args = taskEnv.ParseAndReplace(check.Args)
  1416  			check.Path = taskEnv.ReplaceEnv(check.Path)
  1417  			check.Protocol = taskEnv.ReplaceEnv(check.Protocol)
  1418  			check.PortLabel = taskEnv.ReplaceEnv(check.PortLabel)
  1419  			check.InitialStatus = taskEnv.ReplaceEnv(check.InitialStatus)
  1420  		}
  1421  		service.Name = taskEnv.ReplaceEnv(service.Name)
  1422  		service.PortLabel = taskEnv.ReplaceEnv(service.PortLabel)
  1423  		service.Tags = taskEnv.ParseAndReplace(service.Tags)
  1424  	}
  1425  	return taskCopy
  1426  }
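
// Illustrative example (not part of the original source): with the task
// environment providing NOMAD_TASK_NAME=web, a service named
// "${NOMAD_TASK_NAME}-frontend" is registered in Consul as "web-frontend",
// and the same substitution applies to tags, port labels, and check fields.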
  1427  
  1428  // buildTaskDir creates the task directory before driver.Prestart. It is safe
  1429  // to call multiple times as its state is persisted.
  1430  func (r *TaskRunner) buildTaskDir(fsi cstructs.FSIsolation) error {
  1431  	r.persistLock.Lock()
  1432  	built := r.taskDirBuilt
  1433  	r.persistLock.Unlock()
  1434  
  1435  	// We do not set the state again since this only occurs during restoration
  1436  	// and the task dir is already built. The reason we call Build again is to
  1437  	// ensure that the task dir invariants are still held.
  1438  	if !built {
  1439  		r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskSetup).
  1440  			SetMessage(structs.TaskBuildingTaskDir))
  1441  	}
  1442  
  1443  	chroot := config.DefaultChrootEnv
  1444  	if len(r.config.ChrootEnv) > 0 {
  1445  		chroot = r.config.ChrootEnv
  1446  	}
  1447  	if err := r.taskDir.Build(built, chroot, fsi); err != nil {
  1448  		return err
  1449  	}
  1450  
  1451  	// Mark task dir as successfully built
  1452  	r.persistLock.Lock()
  1453  	r.taskDirBuilt = true
  1454  	r.persistLock.Unlock()
  1455  
  1456  	// Set path and host related env vars
  1457  	driver.SetEnvvars(r.envBuilder, fsi, r.taskDir, r.config)
  1458  	return nil
  1459  }
  1460  
  1461  // collectResourceUsageStats starts collecting resource usage stats of a Task.
  1462  // Collection ends when the passed channel is closed
  1463  func (r *TaskRunner) collectResourceUsageStats(stopCollection <-chan struct{}) {
  1464  	// start collecting the stats right away and then start collecting every
  1465  	// collection interval
  1466  	next := time.NewTimer(0)
  1467  	defer next.Stop()
  1468  	for {
  1469  		select {
  1470  		case <-next.C:
  1471  			next.Reset(r.config.StatsCollectionInterval)
  1472  			handle := r.getHandle()
  1473  			if handle == nil {
  1474  				continue
  1475  			}
  1476  			ru, err := handle.Stats()
  1477  
  1478  			if err != nil {
  1479  				// Check if the driver doesn't implement stats
  1480  				if err.Error() == driver.DriverStatsNotImplemented.Error() {
  1481  					r.logger.Printf("[DEBUG] client: driver for task %q in allocation %q doesn't support stats", r.task.Name, r.alloc.ID)
  1482  					return
  1483  				}
  1484  
  1485  				// We do not log when the plugin is shutdown as this is simply a
  1486  				// race between the stopCollection channel being closed and calling
  1487  				// Stats on the handle.
  1488  				if !strings.Contains(err.Error(), "connection is shut down") {
  1489  					r.logger.Printf("[WARN] client: error fetching stats of task %v: %v", r.task.Name, err)
  1490  				}
  1491  				continue
  1492  			}
  1493  
  1494  			r.resourceUsageLock.Lock()
  1495  			r.resourceUsage = ru
  1496  			r.resourceUsageLock.Unlock()
  1497  			if ru != nil {
  1498  				r.emitStats(ru)
  1499  			}
  1500  		case <-stopCollection:
  1501  			return
  1502  		}
  1503  	}
  1504  }
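
        // Lifecycle sketch (the caller shown is hypothetical): the collector is
        // expected to run in its own goroutine and is torn down by closing the
        // stop channel, e.g.
        //
        //	stopCollection := make(chan struct{})
        //	go r.collectResourceUsageStats(stopCollection)
        //	// ... task runs ...
        //	close(stopCollection) // returns on the next select iteration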
  1505  
  1506  // LatestResourceUsage returns the last resource utilization datapoint collected
  1507  func (r *TaskRunner) LatestResourceUsage() *cstructs.TaskResourceUsage {
  1508  	r.resourceUsageLock.RLock()
  1509  	defer r.resourceUsageLock.RUnlock()
  1510  	r.runningLock.Lock()
  1511  	defer r.runningLock.Unlock()
  1512  
  1513  	// If the task is not running there can be no latest resource usage
  1514  	if !r.running {
  1515  		return nil
  1516  	}
  1517  
  1518  	return r.resourceUsage
  1519  }
  1520  
  1521  // handleUpdate takes an updated allocation and updates internal state to
  1522  // reflect the new config for the task.
  1523  func (r *TaskRunner) handleUpdate(update *structs.Allocation) error {
  1524  	// Extract the task group from the alloc.
  1525  	tg := update.Job.LookupTaskGroup(update.TaskGroup)
  1526  	if tg == nil {
  1527  		return fmt.Errorf("alloc '%s' missing task group '%s'", update.ID, update.TaskGroup)
  1528  	}
  1529  
  1530  	// Extract the task.
  1531  	var updatedTask *structs.Task
  1532  	for _, t := range tg.Tasks {
  1533  		if t.Name == r.task.Name {
  1534  			updatedTask = t.Copy()
  1535  		}
  1536  	}
  1537  	if updatedTask == nil {
  1538  		return fmt.Errorf("task group %q doesn't contain task %q", tg.Name, r.task.Name)
  1539  	}
  1540  
  1541  	// Merge in the task resources
  1542  	updatedTask.Resources = update.TaskResources[updatedTask.Name]
  1543  
  1544  	// Update the task's environment for interpolating in services/checks
  1545  	r.envBuilder.UpdateTask(update, updatedTask)
  1546  
  1547  	var mErr multierror.Error
  1548  	r.handleLock.Lock()
  1549  	if r.handle != nil {
  1550  		drv, err := r.createDriver()
  1551  		if err != nil {
  1552  			// Something has really gone wrong; don't continue
  1553  			r.handleLock.Unlock()
  1554  			return fmt.Errorf("error accessing driver when updating task %q: %v", r.task.Name, err)
  1555  		}
  1556  
  1557  		// Update will update resources and store the new kill timeout.
  1558  		if err := r.handle.Update(updatedTask); err != nil {
  1559  			mErr.Errors = append(mErr.Errors, fmt.Errorf("updating task resources failed: %v", err))
  1560  		}
  1561  
  1562  		// Update services in Consul
  1563  		if err := r.updateServices(drv, r.handle, r.task, updatedTask); err != nil {
  1564  			mErr.Errors = append(mErr.Errors, fmt.Errorf("error updating services and checks in Consul: %v", err))
  1565  		}
  1566  	}
  1567  	r.handleLock.Unlock()
  1568  
  1569  	// Update the restart policy.
  1570  	if r.restartTracker != nil {
  1571  		r.restartTracker.SetPolicy(tg.RestartPolicy)
  1572  	}
  1573  
  1574  	// Store the updated alloc.
  1575  	r.alloc = update
  1576  	r.task = updatedTask
  1577  	return mErr.ErrorOrNil()
  1578  }
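
        // Editor's note, summarizing the code above: an in-place update can
        // change the task's resources and kill timeout (handle.Update), its
        // services and checks in Consul, and the group's restart policy; errors
        // from the driver and Consul are accumulated into a single multierror.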
  1579  
  1580  // updateServices updates the task's services and checks registered with Consul.
  1581  func (r *TaskRunner) updateServices(d driver.Driver, h driver.ScriptExecutor, old, new *structs.Task) error {
  1582  	var exec driver.ScriptExecutor
  1583  	if d.Abilities().Exec {
  1584  		// Only set the script executor if the driver supports exec
  1585  		exec = h
  1586  	}
  1587  	newInterpolatedTask := interpolateServices(r.envBuilder.Build(), new)
  1588  	oldInterpolatedTask := interpolateServices(r.envBuilder.Build(), old)
  1589  	r.driverNetLock.Lock()
  1590  	net := r.driverNet.Copy()
  1591  	r.driverNetLock.Unlock()
  1592  	return r.consul.UpdateTask(r.alloc.ID, oldInterpolatedTask, newInterpolatedTask, exec, net)
  1593  }
  1594  
  1595  // handleDestroy kills the task handle. In the case that killing fails,
  1596  // handleDestroy will retry with an exponential backoff and will give up at a
  1597  // given limit. It returns whether the task was destroyed and the error
  1598  // associated with the last kill attempt.
  1599  func (r *TaskRunner) handleDestroy(handle driver.DriverHandle) (destroyed bool, err error) {
  1600  	// Cap the number of times we attempt to kill the task.
  1601  	for i := 0; i < killFailureLimit; i++ {
  1602  		if err = handle.Kill(); err != nil {
  1603  			// Calculate the new backoff
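        			// With killBackoffBaseline = 5s the series is 5s, 20s, 80s;
        			// killBackoffLimit then caps every later attempt at 2m.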
  1604  			backoff := (1 << (2 * uint64(i))) * killBackoffBaseline
  1605  			if backoff > killBackoffLimit {
  1606  				backoff = killBackoffLimit
  1607  			}
  1608  
  1609  			r.logger.Printf("[ERR] client: failed to kill task '%s' for alloc %q. Retrying in %v: %v",
  1610  				r.task.Name, r.alloc.ID, backoff, err)
  1611  			time.Sleep(time.Duration(backoff))
  1612  		} else {
  1613  			// Kill was successful
  1614  			return true, nil
  1615  		}
  1616  	}
  1617  	return
  1618  }
  1619  
  1620  // Restart will restart the task
  1621  func (r *TaskRunner) Restart(source, reason string) {
  1622  	reasonStr := fmt.Sprintf("%s: %s", source, reason)
  1623  	event := structs.NewTaskEvent(structs.TaskRestartSignal).SetRestartReason(reasonStr)
  1624  
  1625  	select {
  1626  	case r.restartCh <- event:
  1627  	case <-r.waitCh:
  1628  	}
  1629  }
  1630  
  1631  // Signal will send a signal to the task
  1632  func (r *TaskRunner) Signal(source, reason string, s os.Signal) error {
  1633  
  1634  	reasonStr := fmt.Sprintf("%s: %s", source, reason)
  1635  	event := structs.NewTaskEvent(structs.TaskSignaling).SetTaskSignal(s).SetTaskSignalReason(reasonStr)
  1636  
  1637  	resCh := make(chan error)
  1638  	se := SignalEvent{
  1639  		s:      s,
  1640  		e:      event,
  1641  		result: resCh,
  1642  	}
  1643  
  1644  	select {
  1645  	case r.signalCh <- se:
  1646  	case <-r.waitCh:
  1647  	}
  1648  
  1649  	return <-resCh
  1650  }
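
        // Hypothetical caller sketch (identifiers assumed, not from this file):
        // a template runner might forward a change signal after re-rendering
        // and block on the driver's result, e.g.
        //
        //	err := r.Signal("consul-template", "template re-rendered", syscall.SIGHUP)
        //
        // The returned error is whatever the signal handler sends back on
        // resCh, i.e. whether the driver delivered the signal to the task.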
  1651  
  1652  // Kill will kill a task and store the error, no longer restarting the task. If
  1653  // fail is set, the task is marked as having failed.
  1654  func (r *TaskRunner) Kill(source, reason string, fail bool) {
  1655  	reasonStr := fmt.Sprintf("%s: %s", source, reason)
  1656  	event := structs.NewTaskEvent(structs.TaskKilling).SetKillReason(reasonStr)
  1657  	if fail {
  1658  		event.SetFailsTask()
  1659  	}
  1660  
  1661  	r.logger.Printf("[DEBUG] client: killing task %v for alloc %q: %v", r.task.Name, r.alloc.ID, reasonStr)
  1662  	r.Destroy(event)
  1663  }
  1664  
  1665  // UnblockStart unblocks the starting of the task. It currently assumes only
  1666  // consul-template will unblock the task.
  1667  func (r *TaskRunner) UnblockStart(source string) {
  1668  	r.unblockLock.Lock()
  1669  	defer r.unblockLock.Unlock()
  1670  	if r.unblocked {
  1671  		return
  1672  	}
  1673  
  1674  	r.logger.Printf("[DEBUG] client: unblocking task %v for alloc %q: %v", r.task.Name, r.alloc.ID, source)
  1675  	r.unblocked = true
  1676  	close(r.unblockCh)
  1677  }
  1678  
  1679  // waitErrorToEvent converts a WaitResult into a TaskTerminated event.
  1680  func (r *TaskRunner) waitErrorToEvent(res *dstructs.WaitResult) *structs.TaskEvent {
  1681  	return structs.NewTaskEvent(structs.TaskTerminated).
  1682  		SetExitCode(res.ExitCode).
  1683  		SetSignal(res.Signal).
  1684  		SetExitMessage(res.Err)
  1685  }
  1686  
  1687  // Update is used to hand the task runner an updated version of its allocation.
  1688  func (r *TaskRunner) Update(update *structs.Allocation) {
  1689  	select {
  1690  	case r.updateCh <- update:
  1691  	default:
  1692  		r.logger.Printf("[ERR] client: dropping task update '%s' (alloc '%s')",
  1693  			r.task.Name, r.alloc.ID)
  1694  	}
  1695  }
  1696  
  1697  // Destroy is used to indicate that the task context should be destroyed. The
  1698  // event parameter provides a context for the destroy.
  1699  func (r *TaskRunner) Destroy(event *structs.TaskEvent) {
  1700  	r.destroyLock.Lock()
  1701  	defer r.destroyLock.Unlock()
  1702  
  1703  	if r.destroy {
  1704  		return
  1705  	}
  1706  	r.destroy = true
  1707  	r.destroyEvent = event
  1708  	close(r.destroyCh)
  1709  }
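
        // Editor's note: Destroy is idempotent by construction; the destroy
        // flag, guarded by destroyLock, ensures destroyCh is closed exactly
        // once, so concurrent callers such as Kill are safe.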
  1710  
  1711  // getCreatedResources returns the resources created by drivers. It will never
  1712  // return nil.
  1713  func (r *TaskRunner) getCreatedResources() *driver.CreatedResources {
  1714  	r.createdResourcesLock.Lock()
  1715  	if r.createdResources == nil {
  1716  		r.createdResources = driver.NewCreatedResources()
  1717  	}
  1718  	cr := r.createdResources.Copy()
  1719  	r.createdResourcesLock.Unlock()
  1720  
  1721  	return cr
  1722  }
  1723  
  1724  // setCreatedResources updates the resources created by drivers. If passed nil
  1725  // it will set createdResources to an initialized struct.
  1726  func (r *TaskRunner) setCreatedResources(cr *driver.CreatedResources) {
  1727  	if cr == nil {
  1728  		cr = driver.NewCreatedResources()
  1729  	}
  1730  	r.createdResourcesLock.Lock()
  1731  	r.createdResources = cr.Copy()
  1732  	r.createdResourcesLock.Unlock()
  1733  }
  1734  
  1735  // emitStats emits resource usage stats of tasks to remote metrics collector
  1736  // sinks
  1737  func (r *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) {
  1738  	if ru.ResourceUsage.MemoryStats != nil && r.config.PublishAllocationMetrics {
  1739  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "rss"}, float32(ru.ResourceUsage.MemoryStats.RSS))
  1740  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "cache"}, float32(ru.ResourceUsage.MemoryStats.Cache))
  1741  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "swap"}, float32(ru.ResourceUsage.MemoryStats.Swap))
  1742  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "max_usage"}, float32(ru.ResourceUsage.MemoryStats.MaxUsage))
  1743  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "kernel_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelUsage))
  1744  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "kernel_max_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage))
  1745  	}
  1746  
  1747  	if ru.ResourceUsage.CpuStats != nil && r.config.PublishAllocationMetrics {
  1748  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_percent"}, float32(ru.ResourceUsage.CpuStats.Percent))
  1749  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "system"}, float32(ru.ResourceUsage.CpuStats.SystemMode))
  1750  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "user"}, float32(ru.ResourceUsage.CpuStats.UserMode))
  1751  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "throttled_time"}, float32(ru.ResourceUsage.CpuStats.ThrottledTime))
  1752  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "throttled_periods"}, float32(ru.ResourceUsage.CpuStats.ThrottledPeriods))
  1753  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_ticks"}, float32(ru.ResourceUsage.CpuStats.TotalTicks))
  1754  	}
  1755  }
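
        // For reference, each gauge above is emitted under a key of the form
        //
        //	client.allocs.<job>.<task-group>.<alloc-id>.<task>.memory.rss
        //
        // (and the analogous cpu series); a dot-delimited go-metrics sink such
        // as statsd reports them under these names.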