
     1  package client
     3  import (
     4  	"bytes"
     5  	"crypto/md5"
     6  	"encoding/hex"
     7  	"fmt"
     8  	"io"
     9  	"io/ioutil"
    10  	"log"
    11  	"os"
    12  	"path/filepath"
    13  	"strings"
    14  	"sync"
    15  	"time"
    17  	metrics ""
    18  	""
    19  	""
    20  	""
    21  	""
    22  	version ""
    23  	""
    24  	""
    25  	""
    26  	""
    27  	""
    28  	""
    29  	""
    31  	""
    32  	dstructs ""
    33  	cstructs ""
    34  )
const (
	// killBackoffBaseline is the baseline time for exponential backoff while
	// killing a task.
	killBackoffBaseline = 5 * time.Second

	// killBackoffLimit is the limit of the exponential backoff for killing
	// the task.
	killBackoffLimit = 2 * time.Minute

	// killFailureLimit is how many times we will attempt to kill a task before
	// giving up and potentially leaking resources.
	killFailureLimit = 5

	// vaultBackoffBaseline is the baseline time for exponential backoff when
	// attempting to retrieve a Vault token. It is the delay used after the
	// first failed derivation attempt in deriveVaultToken.
	vaultBackoffBaseline = 5 * time.Second

	// vaultBackoffLimit is the limit of the exponential backoff when attempting
	// to retrieve a Vault token; deriveVaultToken never waits longer than this
	// between attempts.
	vaultBackoffLimit = 3 * time.Minute

	// vaultTokenFile is the name of the file holding the Vault token inside the
	// task's secret directory. It is joined onto taskDir.SecretsDir both when
	// the token is written and when it is recovered in RestoreState.
	vaultTokenFile = "vault_token"
)
var (
	// taskRunnerStateAllKey is the key, inside the task's bucket of the state
	// database, under which the whole serialized taskRunnerState is stored
	// and read back. At the moment there is no need to split the state
	// across several keys.
	taskRunnerStateAllKey = []byte("simple-all")
)
// taskRestartEvent wraps a TaskEvent with additional metadata to control
// restart behavior. Values are built by newTaskRestartEvent and travel over
// TaskRunner.restartCh.
type taskRestartEvent struct {
	// taskEvent is the event to report for this restart.
	taskEvent *structs.TaskEvent

	// failure marks whether the restart counts as a task failure; if false,
	// don't count it against the restart count.
	failure bool
}
    78  func newTaskRestartEvent(reason string, failure bool) *taskRestartEvent {
    79  	return &taskRestartEvent{
    80  		taskEvent: structs.NewTaskEvent(structs.TaskRestartSignal).SetRestartReason(reason),
    81  		failure:   failure,
    82  	}
    83  }
// TaskRunner is used to wrap a task within an allocation and provide the
// execution context. Instances are created with NewTaskRunner, optionally
// primed with RestoreState, and driven by Run; WaitCh reports when Run has
// returned.
type TaskRunner struct {
	// stateDB is the bolt database SaveState writes snapshots to and
	// RestoreState reads them from.
	stateDB *bolt.DB
	// config is the client configuration handed to NewTaskRunner.
	config *config.Config
	// updater is the callback invoked whenever the task's state changes.
	updater TaskStateUpdater
	logger  *log.Logger
	// restartTracker is built from the task group's restart policy and the
	// job type in NewTaskRunner.
	restartTracker *RestartTracker
	consul         ConsulServiceAPI

	// running marks whether the task is running
	running     bool
	runningLock sync.Mutex

	// resourceUsage is presumably guarded by resourceUsageLock — confirm
	// against the accessors, which are outside this section.
	resourceUsage     *cstructs.TaskResourceUsage
	resourceUsageLock sync.RWMutex

	// alloc, task and taskDir identify the allocation, the task within it
	// and the task's directory layout on disk.
	alloc   *structs.Allocation
	task    *structs.Task
	taskDir *allocdir.TaskDir

	// envBuilder is used to build the task's environment
	envBuilder *env.Builder

	// driverNet is the network information returned by the driver. SaveState
	// copies it while holding driverNetLock.
	driverNet     *cstructs.DriverNetwork
	driverNetLock sync.Mutex

	// updateCh is used to receive updated versions of the allocation
	updateCh chan *structs.Allocation

	// handle is the driver handle of the task, nil when there is none.
	// Access goes through handleLock (see getHandle).
	handle     driver.DriverHandle
	handleLock sync.Mutex

	// artifactsDownloaded tracks whether the tasks artifacts have been
	// downloaded
	//
	// Must acquire persistLock when accessing
	artifactsDownloaded bool

	// taskDirBuilt tracks whether the task has built its directory.
	//
	// Must acquire persistLock when accessing
	taskDirBuilt bool

	// createdResources are all the resources created by the task driver
	// across all attempts to start the task.
	// Simple gets and sets should use {get,set}CreatedResources
	createdResources     *driver.CreatedResources
	createdResourcesLock sync.Mutex

	// payloadRendered tracks whether the payload has been rendered to disk
	payloadRendered bool

	// vaultFuture is the means to wait for and get a Vault token
	vaultFuture *tokenFuture

	// recoveredVaultToken is the token that was recovered through a restore
	recoveredVaultToken string

	// vaultClient is used to retrieve and renew any needed Vault token
	vaultClient vaultclient.VaultClient

	// templateManager is used to manage any consul-templates this task may have
	templateManager *TaskTemplateManager

	// startCh is used to trigger the start of the task
	startCh chan struct{}

	// unblockCh is used to unblock the starting of the task
	unblockCh   chan struct{}
	unblocked   bool
	unblockLock sync.Mutex

	// restartCh is used to restart a task
	restartCh chan *taskRestartEvent

	// signalCh is used to send a signal to a task
	signalCh chan SignalEvent

	// destroy records that the runner has been destroyed; SaveState checks
	// it under destroyLock and skips persisting once it is set.
	destroy      bool
	destroyCh    chan struct{}
	destroyLock  sync.Mutex
	destroyEvent *structs.TaskEvent

	// waitCh closing marks the run loop as having exited
	waitCh chan struct{}

	// persistLock must be acquired when accessing fields stored by
	// SaveState. SaveState is called asynchronously to TaskRunner.Run by
	// AllocRunner, so all state fields must be synchronized using this
	// lock.
	persistLock sync.Mutex

	// persistedHash is the hash of the last persisted snapshot. It is used to
	// detect if a new snapshot has to be written to disk.
	persistedHash []byte

	// baseLabels are used when emitting tagged metrics. All task runner metrics
	// will have these tags, and optionally more.
	baseLabels []metrics.Label
}
// taskRunnerState is used to snapshot the state of the task runner. SaveState
// fills one in from the live TaskRunner and serializes it; RestoreState
// decodes it and copies the fields back.
type taskRunnerState struct {
	// Version is the client version number that wrote the snapshot.
	Version string
	// HandleID is the ID of the driver handle, empty when there was no handle.
	HandleID string
	// ArtifactDownloaded, TaskDirBuilt and PayloadRendered mirror the
	// TaskRunner fields artifactsDownloaded, taskDirBuilt and payloadRendered.
	ArtifactDownloaded bool
	TaskDirBuilt       bool
	PayloadRendered    bool
	// CreatedResources are the driver-created resources at snapshot time.
	CreatedResources *driver.CreatedResources
	// DriverNetwork is a copy of the driver's network information.
	DriverNetwork *cstructs.DriverNetwork
}
   198  func (s *taskRunnerState) Hash() []byte {
   199  	h := md5.New()
   201  	io.WriteString(h, s.Version)
   202  	io.WriteString(h, s.HandleID)
   203  	io.WriteString(h, fmt.Sprintf("%v", s.ArtifactDownloaded))
   204  	io.WriteString(h, fmt.Sprintf("%v", s.TaskDirBuilt))
   205  	io.WriteString(h, fmt.Sprintf("%v", s.PayloadRendered))
   206  	h.Write(s.CreatedResources.Hash())
   207  	h.Write(s.DriverNetwork.Hash())
   209  	return h.Sum(nil)
   210  }
// TaskStateUpdater is used to signal that tasks state has changed. It is
// called with the task's name, its new state and the event that caused the
// change. If lazySync is set the event won't be immediately pushed to the
// server.
type TaskStateUpdater func(taskName, state string, event *structs.TaskEvent, lazySync bool)
// SignalEvent is a tuple of the signal and the event generating it. Values
// of this type are carried on TaskRunner.signalCh.
type SignalEvent struct {
	// s is the signal to be sent
	s os.Signal

	// e is the task event generating the signal
	e *structs.TaskEvent

	// result should be used to send back the result of the signal
	result chan<- error
}
   228  // NewTaskRunner is used to create a new task context
   229  func NewTaskRunner(logger *log.Logger, config *config.Config,
   230  	stateDB *bolt.DB, updater TaskStateUpdater, taskDir *allocdir.TaskDir,
   231  	alloc *structs.Allocation, task *structs.Task,
   232  	vaultClient vaultclient.VaultClient, consulClient ConsulServiceAPI) *TaskRunner {
   234  	// Merge in the task resources
   235  	task.Resources = alloc.TaskResources[task.Name]
   237  	// Build the restart tracker.
   238  	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
   239  	if tg == nil {
   240  		logger.Printf("[ERR] client: alloc %q for missing task group %q", alloc.ID, alloc.TaskGroup)
   241  		return nil
   242  	}
   243  	restartTracker := newRestartTracker(tg.RestartPolicy, alloc.Job.Type)
   245  	// Initialize the environment builder
   246  	envBuilder := env.NewBuilder(config.Node, alloc, task, config.Region)
   248  	tc := &TaskRunner{
   249  		config:           config,
   250  		stateDB:          stateDB,
   251  		updater:          updater,
   252  		logger:           logger,
   253  		restartTracker:   restartTracker,
   254  		alloc:            alloc,
   255  		task:             task,
   256  		taskDir:          taskDir,
   257  		envBuilder:       envBuilder,
   258  		createdResources: driver.NewCreatedResources(),
   259  		consul:           consulClient,
   260  		vaultClient:      vaultClient,
   261  		vaultFuture:      NewTokenFuture().Set(""),
   262  		updateCh:         make(chan *structs.Allocation, 64),
   263  		destroyCh:        make(chan struct{}),
   264  		waitCh:           make(chan struct{}),
   265  		startCh:          make(chan struct{}, 1),
   266  		unblockCh:        make(chan struct{}),
   267  		restartCh:        make(chan *taskRestartEvent),
   268  		signalCh:         make(chan SignalEvent),
   269  	}
   271  	tc.baseLabels = []metrics.Label{
   272  		{
   273  			Name:  "job",
   274  			Value: tc.alloc.Job.Name,
   275  		},
   276  		{
   277  			Name:  "task_group",
   278  			Value: tc.alloc.TaskGroup,
   279  		},
   280  		{
   281  			Name:  "alloc_id",
   282  			Value: tc.alloc.ID,
   283  		},
   284  		{
   285  			Name:  "task",
   286  			Value: tc.task.Name,
   287  		},
   288  	}
   290  	return tc
   291  }
   293  // MarkReceived marks the task as received.
   294  func (r *TaskRunner) MarkReceived() {
   295  	// We lazy sync this since there will be a follow up message almost
   296  	// immediately.
   297  	r.updater(r.task.Name, structs.TaskStatePending, structs.NewTaskEvent(structs.TaskReceived), true)
   298  }
   300  // WaitCh returns a channel to wait for termination
   301  func (r *TaskRunner) WaitCh() <-chan struct{} {
   302  	return r.waitCh
   303  }
   305  // getHandle returns the task's handle or nil
   306  func (r *TaskRunner) getHandle() driver.DriverHandle {
   307  	r.handleLock.Lock()
   308  	h := r.handle
   309  	r.handleLock.Unlock()
   310  	return h
   311  }
   313  // pre060StateFilePath returns the path to our state file that would have been
   314  // written pre v0.6.0
   315  // COMPAT: Remove in 0.7.0
   316  func (r *TaskRunner) pre060StateFilePath() string {
   317  	// Get the MD5 of the task name
   318  	hashVal := md5.Sum([]byte(r.task.Name))
   319  	hashHex := hex.EncodeToString(hashVal[:])
   320  	dirName := fmt.Sprintf("task-%s", hashHex)
   322  	// Generate the path
   323  	return filepath.Join(r.config.StateDir, "alloc", r.alloc.ID, dirName, "state.json")
   324  }
   326  // RestoreState is used to restore our state. If a non-empty string is returned
   327  // the task is restarted with the string as the reason. This is useful for
   328  // backwards incompatible upgrades that need to restart tasks with a new
   329  // executor.
   330  func (r *TaskRunner) RestoreState() (string, error) {
   331  	// COMPAT: Remove in 0.7.0
   332  	// 0.6.0 transitioned from individual state files to a single bolt-db.
   333  	// The upgrade path is to:
   334  	// Check if old state exists
   335  	//   If so, restore from that and delete old state
   336  	// Restore using state database
   338  	var snap taskRunnerState
   340  	// Check if the old snapshot is there
   341  	oldPath := r.pre060StateFilePath()
   342  	if err := pre060RestoreState(oldPath, &snap); err == nil {
   343  		// Delete the old state
   344  		os.RemoveAll(oldPath)
   345  	} else if !os.IsNotExist(err) {
   346  		// Something corrupt in the old state file
   347  		return "", err
   348  	} else {
   349  		// We are doing a normal restore
   350  		err := r.stateDB.View(func(tx *bolt.Tx) error {
   351  			bkt, err := getTaskBucket(tx, r.alloc.ID, r.task.Name)
   352  			if err != nil {
   353  				return fmt.Errorf("failed to get task bucket: %v", err)
   354  			}
   356  			if err := getObject(bkt, taskRunnerStateAllKey, &snap); err != nil {
   357  				return fmt.Errorf("failed to read task runner state: %v", err)
   358  			}
   359  			return nil
   360  		})
   361  		if err != nil {
   362  			return "", err
   363  		}
   365  	}
   367  	// Restore fields from the snapshot
   368  	r.artifactsDownloaded = snap.ArtifactDownloaded
   369  	r.taskDirBuilt = snap.TaskDirBuilt
   370  	r.payloadRendered = snap.PayloadRendered
   371  	r.setCreatedResources(snap.CreatedResources)
   372  	r.driverNet = snap.DriverNetwork
   374  	if r.task.Vault != nil {
   375  		// Read the token from the secret directory
   376  		tokenPath := filepath.Join(r.taskDir.SecretsDir, vaultTokenFile)
   377  		data, err := ioutil.ReadFile(tokenPath)
   378  		if err != nil {
   379  			if !os.IsNotExist(err) {
   380  				return "", fmt.Errorf("failed to read token for task %q in alloc %q: %v", r.task.Name, r.alloc.ID, err)
   381  			}
   383  			// Token file doesn't exist
   384  		} else {
   385  			// Store the recovered token
   386  			r.recoveredVaultToken = string(data)
   387  		}
   388  	}
   390  	// Restore the driver
   391  	restartReason := ""
   392  	if snap.HandleID != "" {
   393  		d, err := r.createDriver()
   394  		if err != nil {
   395  			return "", err
   396  		}
   398  		// Add the restored network driver to the environment
   399  		r.envBuilder.SetDriverNetwork(r.driverNet)
   401  		// Open a connection to the driver handle
   402  		ctx := driver.NewExecContext(r.taskDir, r.envBuilder.Build())
   403  		handle, err := d.Open(ctx, snap.HandleID)
   405  		// In the case it fails, we relaunch the task in the Run() method.
   406  		if err != nil {
   407  			r.logger.Printf("[ERR] client: failed to open handle to task %q for alloc %q: %v",
   408  				r.task.Name, r.alloc.ID, err)
   409  			return "", nil
   410  		}
   412  		if pre06ScriptCheck(snap.Version, r.task.Driver, r.task.Services) {
   413  			restartReason = pre06ScriptCheckReason
   414  		}
   416  		if err := r.registerServices(d, handle, r.driverNet); err != nil {
   417  			// Don't hard fail here as there's a chance this task
   418  			// registered with Consul properly when it initial
   419  			// started.
   420  			r.logger.Printf("[WARN] client: failed to register services and checks with consul for task %q in alloc %q: %v",
   421  				r.task.Name, r.alloc.ID, err)
   422  		}
   424  		r.handleLock.Lock()
   425  		r.handle = handle
   426  		r.handleLock.Unlock()
   428  		r.runningLock.Lock()
   429  		r.running = true
   430  		r.runningLock.Unlock()
   431  	}
   432  	return restartReason, nil
   433  }
// ver06 is the version threshold used for checking for pre-0.6 script checks:
// snapshots written by a version lower than this one are considered pre-0.6.
var ver06 = version.Must(version.NewVersion("0.6.0dev"))

// pre06ScriptCheckReason is the restart reason given when a pre-0.6 script
// check is found on an affected task; RestoreState returns it when
// pre06ScriptCheck reports true.
const pre06ScriptCheckReason = "upgrading pre-0.6 script checks"
   442  // pre06ScriptCheck returns true if version is prior to 0.6.0dev, has a script
   443  // check, and uses exec or java drivers.
   444  func pre06ScriptCheck(ver, driver string, services []*structs.Service) bool {
   445  	if driver != "exec" && driver != "java" && driver != "mock_driver" {
   446  		// Only exec and java are affected
   447  		return false
   448  	}
   449  	v, err := version.NewVersion(ver)
   450  	if err != nil {
   451  		// Treat it as old
   452  		return true
   453  	}
   454  	if !v.LessThan(ver06) {
   455  		// >= 0.6.0dev
   456  		return false
   457  	}
   458  	for _, service := range services {
   459  		for _, check := range service.Checks {
   460  			if check.Type == "script" {
   461  				return true
   462  			}
   463  		}
   464  	}
   465  	return false
   466  }
   468  // SaveState is used to snapshot our state
   469  func (r *TaskRunner) SaveState() error {
   470  	r.destroyLock.Lock()
   471  	defer r.destroyLock.Unlock()
   472  	if r.destroy {
   473  		// Don't save state if already destroyed
   474  		return nil
   475  	}
   477  	r.persistLock.Lock()
   478  	defer r.persistLock.Unlock()
   479  	snap := taskRunnerState{
   480  		Version:            r.config.Version.VersionNumber(),
   481  		ArtifactDownloaded: r.artifactsDownloaded,
   482  		TaskDirBuilt:       r.taskDirBuilt,
   483  		PayloadRendered:    r.payloadRendered,
   484  		CreatedResources:   r.getCreatedResources(),
   485  	}
   487  	r.handleLock.Lock()
   488  	if r.handle != nil {
   489  		snap.HandleID = r.handle.ID()
   490  	}
   491  	r.handleLock.Unlock()
   493  	r.driverNetLock.Lock()
   494  	snap.DriverNetwork = r.driverNet.Copy()
   495  	r.driverNetLock.Unlock()
   497  	// If nothing has changed avoid the write
   498  	h := snap.Hash()
   499  	if bytes.Equal(h, r.persistedHash) {
   500  		return nil
   501  	}
   503  	// Serialize the object
   504  	var buf bytes.Buffer
   505  	if err := codec.NewEncoder(&buf, structs.MsgpackHandle).Encode(&snap); err != nil {
   506  		return fmt.Errorf("failed to serialize snapshot: %v", err)
   507  	}
   509  	// Start the transaction.
   510  	return r.stateDB.Batch(func(tx *bolt.Tx) error {
   511  		// Grab the task bucket
   512  		taskBkt, err := getTaskBucket(tx, r.alloc.ID, r.task.Name)
   513  		if err != nil {
   514  			return fmt.Errorf("failed to retrieve allocation bucket: %v", err)
   515  		}
   517  		if err := putData(taskBkt, taskRunnerStateAllKey, buf.Bytes()); err != nil {
   518  			return fmt.Errorf("failed to write task_runner state: %v", err)
   519  		}
   521  		// Store the hash that was persisted
   522  		tx.OnCommit(func() {
   523  			r.persistedHash = h
   524  		})
   526  		return nil
   527  	})
   528  }
   530  // DestroyState is used to cleanup after ourselves
   531  func (r *TaskRunner) DestroyState() error {
   532  	r.persistLock.Lock()
   533  	defer r.persistLock.Unlock()
   535  	return r.stateDB.Update(func(tx *bolt.Tx) error {
   536  		if err := deleteTaskBucket(tx, r.alloc.ID, r.task.Name); err != nil {
   537  			return fmt.Errorf("failed to delete task bucket: %v", err)
   538  		}
   539  		return nil
   540  	})
   541  }
   543  // setState is used to update the state of the task runner
   544  func (r *TaskRunner) setState(state string, event *structs.TaskEvent, lazySync bool) {
   545  	event.PopulateEventDisplayMessage()
   547  	// Persist our state to disk.
   548  	if err := r.SaveState(); err != nil {
   549  		r.logger.Printf("[ERR] client: failed to save state of Task Runner for task %q: %v", r.task.Name, err)
   550  	}
   552  	// Indicate the task has been updated.
   553  	r.updater(r.task.Name, state, event, lazySync)
   554  }
   556  // createDriver makes a driver for the task
   557  func (r *TaskRunner) createDriver() (driver.Driver, error) {
   558  	// Create a task-specific event emitter callback to expose minimal
   559  	// state to drivers
   560  	eventEmitter := func(m string, args ...interface{}) {
   561  		msg := fmt.Sprintf(m, args...)
   562  		r.logger.Printf("[DEBUG] client: driver event for alloc %q: %s", r.alloc.ID, msg)
   563  		r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskDriverMessage).SetDriverMessage(msg), false)
   564  	}
   566  	driverCtx := driver.NewDriverContext(r.task.Name, r.alloc.ID, r.config, r.config.Node, r.logger, eventEmitter)
   567  	d, err := driver.NewDriver(r.task.Driver, driverCtx)
   568  	if err != nil {
   569  		return nil, fmt.Errorf("failed to create driver '%s' for alloc %s: %v",
   570  			r.task.Driver, r.alloc.ID, err)
   571  	}
   573  	return d, err
   574  }
// Run is a long running routine used to manage the task. waitCh is closed
// when it returns.
//
// Before anything is started the task is validated, a temporary driver is
// created to learn the required filesystem isolation, and the task directory
// is built. A failure in any of these steps marks the task dead with a
// failing event and ends Run. When the task has a Vault block, the Vault
// manager goroutine is started with any token recovered by RestoreState.
func (r *TaskRunner) Run() {
	defer close(r.waitCh)
	r.logger.Printf("[DEBUG] client: starting task context for '%s' (alloc '%s')",
		r.task.Name, r.alloc.ID)

	if err := r.validateTask(); err != nil {
		r.setState(
			structs.TaskStateDead,
			structs.NewTaskEvent(structs.TaskFailedValidation).SetValidationError(err).SetFailsTask(),
			false)
		return
	}

	// Create a temporary driver so that we can determine the FSIsolation
	// required. run->startTask will create a new driver after environment
	// has been setup (env vars, templates, artifacts, secrets, etc).
	tmpDrv, err := r.createDriver()
	if err != nil {
		e := fmt.Errorf("failed to create driver of task %q for alloc %q: %v", r.task.Name, r.alloc.ID, err)
		r.setState(
			structs.TaskStateDead,
			structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(e).SetFailsTask(),
			false)
		return
	}

	// Build base task directory structure regardless of FS isolation abilities.
	// This needs to happen before we start the Vault manager and call prestart
	// as both those can write to the task directories
	if err := r.buildTaskDir(tmpDrv.FSIsolation()); err != nil {
		e := fmt.Errorf("failed to build task directory for %q: %v", r.task.Name, err)
		r.setState(
			structs.TaskStateDead,
			structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(e).SetFailsTask(),
			false)
		return
	}

	// If there is no Vault policy leave the static future created in
	// NewTaskRunner
	if r.task.Vault != nil {
		// Start the go-routine to get a Vault token
		r.vaultFuture.Clear()
		go r.vaultManager(r.recoveredVaultToken)
	}

	// Start the run loop
	// NOTE(review): no statement follows the comment above, so as written
	// postrun is reached immediately after setup. Presumably a call into
	// the run loop belongs here — confirm it was not dropped.

	// Do any cleanup necessary
	r.postrun()

	return
}
   632  // validateTask validates the fields of the task and returns an error if the
   633  // task is invalid.
   634  func (r *TaskRunner) validateTask() error {
   635  	var mErr multierror.Error
   637  	// Validate the user.
   638  	unallowedUsers := r.config.ReadStringListToMapDefault("user.blacklist", config.DefaultUserBlacklist)
   639  	checkDrivers := r.config.ReadStringListToMapDefault("user.checked_drivers", config.DefaultUserCheckedDrivers)
   640  	if _, driverMatch := checkDrivers[r.task.Driver]; driverMatch {
   641  		if _, unallowed := unallowedUsers[r.task.User]; unallowed {
   642  			mErr.Errors = append(mErr.Errors, fmt.Errorf("running as user %q is disallowed", r.task.User))
   643  		}
   644  	}
   646  	// Validate the artifacts
   647  	for i, artifact := range r.task.Artifacts {
   648  		// Verify the artifact doesn't escape the task directory.
   649  		if err := artifact.Validate(); err != nil {
   650  			// If this error occurs there is potentially a server bug or
   651  			// malicious, server spoofing.
   652  			r.logger.Printf("[ERR] client: allocation %q, task %v, artifact %#v (%v) fails validation: %v",
   653  				r.alloc.ID, r.task.Name, artifact, i, err)
   654  			mErr.Errors = append(mErr.Errors, fmt.Errorf("artifact (%d) failed validation: %v", i, err))
   655  		}
   656  	}
   658  	// Validate the Service names
   659  	taskEnv := r.envBuilder.Build()
   660  	for i, service := range r.task.Services {
   661  		name := taskEnv.ReplaceEnv(service.Name)
   662  		if err := service.ValidateName(name); err != nil {
   663  			mErr.Errors = append(mErr.Errors, fmt.Errorf("service (%d) failed validation: %v", i, err))
   664  		}
   665  	}
   667  	if len(mErr.Errors) == 1 {
   668  		return mErr.Errors[0]
   669  	}
   670  	return mErr.ErrorOrNil()
   671  }
   673  // tokenFuture stores the Vault token and allows consumers to block till a valid
   674  // token exists
   675  type tokenFuture struct {
   676  	waiting []chan struct{}
   677  	token   string
   678  	set     bool
   679  	m       sync.Mutex
   680  }
   682  // NewTokenFuture returns a new token future without any token set
   683  func NewTokenFuture() *tokenFuture {
   684  	return &tokenFuture{}
   685  }
   687  // Wait returns a channel that can be waited on. When this channel unblocks, a
   688  // valid token will be available via the Get method
   689  func (f *tokenFuture) Wait() <-chan struct{} {
   690  	f.m.Lock()
   691  	defer f.m.Unlock()
   693  	c := make(chan struct{})
   694  	if f.set {
   695  		close(c)
   696  		return c
   697  	}
   699  	f.waiting = append(f.waiting, c)
   700  	return c
   701  }
   703  // Set sets the token value and unblocks any caller of Wait
   704  func (f *tokenFuture) Set(token string) *tokenFuture {
   705  	f.m.Lock()
   706  	defer f.m.Unlock()
   708  	f.set = true
   709  	f.token = token
   710  	for _, w := range f.waiting {
   711  		close(w)
   712  	}
   713  	f.waiting = nil
   714  	return f
   715  }
   717  // Clear clears the set vault token.
   718  func (f *tokenFuture) Clear() *tokenFuture {
   719  	f.m.Lock()
   720  	defer f.m.Unlock()
   722  	f.token = ""
   723  	f.set = false
   724  	return f
   725  }
   727  // Get returns the set Vault token
   728  func (f *tokenFuture) Get() string {
   729  	f.m.Lock()
   730  	defer f.m.Unlock()
   731  	return f.token
   732  }
   734  // vaultManager should be called in a go-routine and manages the derivation,
   735  // renewal and handling of errors with the Vault token. The optional parameter
   736  // allows setting the initial Vault token. This is useful when the Vault token
   737  // is recovered off disk.
   738  func (r *TaskRunner) vaultManager(token string) {
   739  	// Helper for stopping token renewal
   740  	stopRenewal := func() {
   741  		if err := r.vaultClient.StopRenewToken(r.vaultFuture.Get()); err != nil {
   742  			r.logger.Printf("[WARN] client: failed to stop token renewal for task %v in alloc %q: %v", r.task.Name, r.alloc.ID, err)
   743  		}
   744  	}
   746  	// updatedToken lets us store state between loops. If true, a new token
   747  	// has been retrieved and we need to apply the Vault change mode
   748  	var updatedToken bool
   750  OUTER:
   751  	for {
   752  		// Check if we should exit
   753  		select {
   754  		case <-r.waitCh:
   755  			stopRenewal()
   756  			return
   757  		default:
   758  		}
   760  		// Clear the token
   761  		r.vaultFuture.Clear()
   763  		// Check if there already is a token which can be the case for
   764  		// restoring the TaskRunner
   765  		if token == "" {
   766  			// Get a token
   767  			var exit bool
   768  			token, exit = r.deriveVaultToken()
   769  			if exit {
   770  				// Exit the manager
   771  				return
   772  			}
   774  			// Write the token to disk
   775  			if err := r.writeToken(token); err != nil {
   776  				e := fmt.Errorf("failed to write Vault token to disk")
   777  				r.logger.Printf("[ERR] client: %v for task %v on alloc %q: %v", e, r.task.Name, r.alloc.ID, err)
   778  				r.Kill("vault", e.Error(), true)
   779  				return
   780  			}
   781  		}
   783  		// Start the renewal process
   784  		renewCh, err := r.vaultClient.RenewToken(token, 30)
   786  		// An error returned means the token is not being renewed
   787  		if err != nil {
   788  			r.logger.Printf("[ERR] client: failed to start renewal of Vault token for task %v on alloc %q: %v", r.task.Name, r.alloc.ID, err)
   789  			token = ""
   790  			goto OUTER
   791  		}
   793  		// The Vault token is valid now, so set it
   794  		r.vaultFuture.Set(token)
   796  		if updatedToken {
   797  			switch r.task.Vault.ChangeMode {
   798  			case structs.VaultChangeModeSignal:
   799  				s, err := signals.Parse(r.task.Vault.ChangeSignal)
   800  				if err != nil {
   801  					e := fmt.Errorf("failed to parse signal: %v", err)
   802  					r.logger.Printf("[ERR] client: %v", err)
   803  					r.Kill("vault", e.Error(), true)
   804  					return
   805  				}
   807  				if err := r.Signal("vault", "new Vault token acquired", s); err != nil {
   808  					r.logger.Printf("[ERR] client: failed to send signal to task %v for alloc %q: %v", r.task.Name, r.alloc.ID, err)
   809  					r.Kill("vault", fmt.Sprintf("failed to send signal to task: %v", err), true)
   810  					return
   811  				}
   812  			case structs.VaultChangeModeRestart:
   813  				const noFailure = false
   814  				r.Restart("vault", "new Vault token acquired", noFailure)
   815  			case structs.VaultChangeModeNoop:
   816  				fallthrough
   817  			default:
   818  				r.logger.Printf("[ERR] client: Invalid Vault change mode: %q", r.task.Vault.ChangeMode)
   819  			}
   821  			// We have handled it
   822  			updatedToken = false
   824  			// Call the handler
   825  			r.updatedTokenHandler()
   826  		}
   828  		// Start watching for renewal errors
   829  		select {
   830  		case err := <-renewCh:
   831  			// Clear the token
   832  			token = ""
   833  			r.logger.Printf("[ERR] client: failed to renew Vault token for task %v on alloc %q: %v", r.task.Name, r.alloc.ID, err)
   834  			stopRenewal()
   836  			// Check if we have to do anything
   837  			if r.task.Vault.ChangeMode != structs.VaultChangeModeNoop {
   838  				updatedToken = true
   839  			}
   840  		case <-r.waitCh:
   841  			stopRenewal()
   842  			return
   843  		}
   844  	}
   845  }
   847  // deriveVaultToken derives the Vault token using exponential backoffs. It
   848  // returns the Vault token and whether the manager should exit.
   849  func (r *TaskRunner) deriveVaultToken() (token string, exit bool) {
   850  	attempts := 0
   851  	for {
   852  		tokens, err := r.vaultClient.DeriveToken(r.alloc, []string{r.task.Name})
   853  		if err == nil {
   854  			return tokens[r.task.Name], false
   855  		}
   857  		// Check if this is a server side error
   858  		if structs.IsServerSide(err) {
   859  			r.logger.Printf("[ERR] client: failed to derive Vault token for task %v on alloc %q: %v",
   860  				r.task.Name, r.alloc.ID, err)
   861  			r.Kill("vault", fmt.Sprintf("server error deriving vault token: %v", err), true)
   862  			return "", true
   863  		}
   864  		// Check if we can't recover from the error
   865  		if !structs.IsRecoverable(err) {
   866  			r.logger.Printf("[ERR] client: failed to derive Vault token for task %v on alloc %q: %v",
   867  				r.task.Name, r.alloc.ID, err)
   868  			r.Kill("vault", fmt.Sprintf("failed to derive token: %v", err), true)
   869  			return "", true
   870  		}
   872  		// Handle the retry case
   873  		backoff := (1 << (2 * uint64(attempts))) * vaultBackoffBaseline
   874  		if backoff > vaultBackoffLimit {
   875  			backoff = vaultBackoffLimit
   876  		}
   877  		r.logger.Printf("[ERR] client: failed to derive Vault token for task %v on alloc %q: %v; retrying in %v",
   878  			r.task.Name, r.alloc.ID, err, backoff)
   880  		attempts++
   882  		// Wait till retrying
   883  		select {
   884  		case <-r.waitCh:
   885  			return "", true
   886  		case <-time.After(backoff):
   887  		}
   888  	}
   889  }
   891  // writeToken writes the given token to disk
   892  func (r *TaskRunner) writeToken(token string) error {
   893  	tokenPath := filepath.Join(r.taskDir.SecretsDir, vaultTokenFile)
   894  	if err := ioutil.WriteFile(tokenPath, []byte(token), 0777); err != nil {
   895  		return fmt.Errorf("failed to save Vault tokens to secret dir for task %q in alloc %q: %v", r.task.Name, r.alloc.ID, err)
   896  	}
   898  	return nil
   899  }
   901  // updatedTokenHandler is called when a new Vault token is retrieved. Things
   902  // that rely on the token should be updated here.
   903  func (r *TaskRunner) updatedTokenHandler() {
   905  	// Update the tasks environment
   906  	r.envBuilder.SetVaultToken(r.vaultFuture.Get(), r.task.Vault.Env)
   908  	if r.templateManager != nil {
   909  		r.templateManager.Stop()
   911  		// Create a new templateManager
   912  		var err error
   913  		r.templateManager, err = NewTaskTemplateManager(&TaskTemplateManagerConfig{
   914  			Hooks:                r,
   915  			Templates:            r.task.Templates,
   916  			ClientConfig:         r.config,
   917  			VaultToken:           r.vaultFuture.Get(),
   918  			TaskDir:              r.taskDir.Dir,
   919  			EnvBuilder:           r.envBuilder,
   920  			MaxTemplateEventRate: DefaultMaxTemplateEventRate,
   921  		})
   923  		if err != nil {
   924  			err := fmt.Errorf("failed to build task's template manager: %v", err)
   925  			r.setState(structs.TaskStateDead,
   926  				structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(),
   927  				false)
   928  			r.logger.Printf("[ERR] client: alloc %q, task %q %v", r.alloc.ID, r.task.Name, err)
   929  			r.Kill("vault", err.Error(), true)
   930  			return
   931  		}
   932  	}
   933  }
   935  // prestart handles life-cycle tasks that occur before the task has started.
   936  // Since it's run asynchronously with the main Run() loop the alloc & task are
   937  // passed in to avoid racing with updates.
   938  func (r *TaskRunner) prestart(alloc *structs.Allocation, task *structs.Task, resultCh chan bool) {
   939  	if task.Vault != nil {
   940  		// Wait for the token
   941  		r.logger.Printf("[DEBUG] client: waiting for Vault token for task %v in alloc %q", task.Name, alloc.ID)
   942  		tokenCh := r.vaultFuture.Wait()
   943  		select {
   944  		case <-tokenCh:
   945  		case <-r.waitCh:
   946  			resultCh <- false
   947  			return
   948  		}
   949  		r.logger.Printf("[DEBUG] client: retrieved Vault token for task %v in alloc %q", task.Name, alloc.ID)
   950  		r.envBuilder.SetVaultToken(r.vaultFuture.Get(), task.Vault.Env)
   951  	}
   953  	// If the job is a dispatch job and there is a payload write it to disk
   954  	requirePayload := len(alloc.Job.Payload) != 0 &&
   955  		(r.task.DispatchPayload != nil && r.task.DispatchPayload.File != "")
   956  	if !r.payloadRendered && requirePayload {
   957  		renderTo := filepath.Join(r.taskDir.LocalDir, task.DispatchPayload.File)
   958  		decoded, err := snappy.Decode(nil, alloc.Job.Payload)
   959  		if err != nil {
   960  			r.setState(
   961  				structs.TaskStateDead,
   962  				structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(),
   963  				false)
   964  			resultCh <- false
   965  			return
   966  		}
   968  		if err := os.MkdirAll(filepath.Dir(renderTo), 07777); err != nil {
   969  			r.setState(
   970  				structs.TaskStateDead,
   971  				structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(),
   972  				false)
   973  			resultCh <- false
   974  			return
   975  		}
   977  		if err := ioutil.WriteFile(renderTo, decoded, 0777); err != nil {
   978  			r.setState(
   979  				structs.TaskStateDead,
   980  				structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(),
   981  				false)
   982  			resultCh <- false
   983  			return
   984  		}
   986  		r.payloadRendered = true
   987  	}
   989  	for {
   990  		r.persistLock.Lock()
   991  		downloaded := r.artifactsDownloaded
   992  		r.persistLock.Unlock()
   994  		// Download the task's artifacts
   995  		if !downloaded && len(task.Artifacts) > 0 {
   996  			r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskDownloadingArtifacts), false)
   997  			taskEnv := r.envBuilder.Build()
   998  			for _, artifact := range task.Artifacts {
   999  				if err := getter.GetArtifact(taskEnv, artifact, r.taskDir.Dir); err != nil {
  1000  					wrapped := fmt.Errorf("failed to download artifact %q: %v", artifact.GetterSource, err)
  1001  					r.logger.Printf("[DEBUG] client: %v", wrapped)
  1002  					r.setState(structs.TaskStatePending,
  1003  						structs.NewTaskEvent(structs.TaskArtifactDownloadFailed).SetDownloadError(wrapped), false)
  1004  					r.restartTracker.SetStartError(structs.WrapRecoverable(wrapped.Error(), err))
  1005  					goto RESTART
  1006  				}
  1007  			}
  1009  			r.persistLock.Lock()
  1010  			r.artifactsDownloaded = true
  1011  			r.persistLock.Unlock()
  1012  		}
  1014  		// We don't have to wait for any template
  1015  		if len(task.Templates) == 0 {
  1016  			// Send the start signal
  1017  			select {
  1018  			case r.startCh <- struct{}{}:
  1019  			default:
  1020  			}
  1022  			resultCh <- true
  1023  			return
  1024  		}
  1026  		// Build the template manager
  1027  		if r.templateManager == nil {
  1028  			var err error
  1029  			r.templateManager, err = NewTaskTemplateManager(&TaskTemplateManagerConfig{
  1030  				Hooks:                r,
  1031  				Templates:            r.task.Templates,
  1032  				ClientConfig:         r.config,
  1033  				VaultToken:           r.vaultFuture.Get(),
  1034  				TaskDir:              r.taskDir.Dir,
  1035  				EnvBuilder:           r.envBuilder,
  1036  				MaxTemplateEventRate: DefaultMaxTemplateEventRate,
  1037  			})
  1038  			if err != nil {
  1039  				err := fmt.Errorf("failed to build task's template manager: %v", err)
  1040  				r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(), false)
  1041  				r.logger.Printf("[ERR] client: alloc %q, task %q %v", alloc.ID, task.Name, err)
  1042  				resultCh <- false
  1043  				return
  1044  			}
  1045  		}
  1047  		// Block for consul-template
  1048  		// TODO Hooks should register themselves as blocking and then we can
  1049  		// periodically enumerate what we are still blocked on
  1050  		select {
  1051  		case <-r.unblockCh:
  1052  			// Send the start signal
  1053  			select {
  1054  			case r.startCh <- struct{}{}:
  1055  			default:
  1056  			}
  1058  			resultCh <- true
  1059  			return
  1060  		case <-r.waitCh:
  1061  			// The run loop has exited so exit too
  1062  			resultCh <- false
  1063  			return
  1064  		}
  1066  	RESTART:
  1067  		restart := r.shouldRestart()
  1068  		if !restart {
  1069  			resultCh <- false
  1070  			return
  1071  		}
  1072  	}
  1073  }
  1075  // postrun is used to do any cleanup that is necessary after exiting the runloop
  1076  func (r *TaskRunner) postrun() {
  1077  	// Stop the template manager
  1078  	if r.templateManager != nil {
  1079  		r.templateManager.Stop()
  1080  	}
  1081  }
// run is the main run loop that handles starting the application, destroying
// it, restarts and signals.
//
// Each iteration of the outer loop is one "life" of the task: prestart is
// launched in its own goroutine and the inner WAIT loop then services events
// (prestart result, start signal, task exit, alloc updates, signals, restart
// requests and destroy requests). Leaving WAIT, or jumping to RESTART after a
// failed start, consults the restart policy; the function returns once the
// task will not be restarted or has been destroyed.
func (r *TaskRunner) run() {
	// Predeclare things so we can jump to the RESTART
	var stopCollection chan struct{}
	var handleWaitCh chan *dstructs.WaitResult

	// If we already have a handle, populate the stopCollection and handleWaitCh
	// to fix the invariant that it exists.
	handleEmpty := r.getHandle() == nil

	if !handleEmpty {
		stopCollection = make(chan struct{})
		go r.collectResourceUsageStats(stopCollection)
		handleWaitCh = r.handle.WaitCh()
	}

	for {
		// Do the prestart activities. The result channel is buffered so
		// prestart can deliver its single result without blocking.
		prestartResultCh := make(chan bool, 1)
		go r.prestart(r.alloc, r.task, prestartResultCh)

	WAIT:
		for {
			select {
			case success := <-prestartResultCh:
				// On success there is nothing to do here: prestart signals
				// readiness separately via startCh.
				if !success {
					r.cleanup()
					r.setState(structs.TaskStateDead, nil, false)
					return
				}
			case <-r.startCh:
				// Start the task if not yet started or it is being forced. This logic
				// is necessary because in the case of a restore the handle already
				// exists.
				handleEmpty := r.getHandle() == nil
				if handleEmpty {
					startErr := r.startTask()
					r.restartTracker.SetStartError(startErr)
					if startErr != nil {
						r.setState("", structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(startErr), true)
						goto RESTART
					}

					// Mark the task as started
					r.setState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted), false)
					r.runningLock.Lock()
					r.running = true
					r.runningLock.Unlock()

					// Begin stats collection unless a collector is already
					// running.
					if stopCollection == nil {
						stopCollection = make(chan struct{})
						go r.collectResourceUsageStats(stopCollection)
					}

					handleWaitCh = r.handle.WaitCh()
				}

			// While handleWaitCh is nil this case can never be selected, so it
			// is effectively disabled until a handle exists.
			case waitRes := <-handleWaitCh:
				if waitRes == nil {
					panic("nil wait")
				}

				r.runningLock.Lock()
				r.running = false
				r.runningLock.Unlock()

				// Stop collection of the task's resource usage
				close(stopCollection)

				// Log whether the task was successful or not.
				r.restartTracker.SetWaitResult(waitRes)
				r.setState("", r.waitErrorToEvent(waitRes), true)
				if !waitRes.Successful() {
					r.logger.Printf("[INFO] client: task %q for alloc %q failed: %v", r.task.Name, r.alloc.ID, waitRes)
				} else {
					r.logger.Printf("[INFO] client: task %q for alloc %q completed successfully", r.task.Name, r.alloc.ID)
				}

				break WAIT
			case update := <-r.updateCh:
				// A failed update is logged only; the loop keeps running.
				if err := r.handleUpdate(update); err != nil {
					r.logger.Printf("[ERR] client: update to task %q failed: %v", r.task.Name, err)
				}

			case se := <-r.signalCh:
				r.runningLock.Lock()
				running := r.running
				r.runningLock.Unlock()
				common := fmt.Sprintf("signal %v to task %v for alloc %q", se.s, r.task.Name, r.alloc.ID)
				if !running {
					// Send no error
					r.logger.Printf("[DEBUG] client: skipping %s", common)
					se.result <- nil
					continue
				}

				r.logger.Printf("[DEBUG] client: sending %s", common)
				r.setState(structs.TaskStateRunning, se.e, false)

				// Deliver the signal and hand the outcome back to the caller
				// waiting on the event's result channel.
				res := r.handle.Signal(se.s)
				se.result <- res

			case restartEvent := <-r.restartCh:
				r.runningLock.Lock()
				running := r.running
				r.runningLock.Unlock()
				common := fmt.Sprintf("task %v for alloc %q", r.task.Name, r.alloc.ID)
				if !running {
					r.logger.Printf("[DEBUG] client: skipping restart of %v: task isn't running", common)
					continue
				}

				r.logger.Printf("[DEBUG] client: restarting %s: %v", common, restartEvent.taskEvent.RestartReason)
				r.setState(structs.TaskStateRunning, restartEvent.taskEvent, false)
				r.killTask(nil)

				// Stop the stats collector for the task that was just killed.
				close(stopCollection)

				// Drain the exit notification of the killed task, if a wait
				// channel exists.
				if handleWaitCh != nil {
					<-handleWaitCh
				}

				r.restartTracker.SetRestartTriggered(restartEvent.failure)
				break WAIT

			case <-r.destroyCh:
				r.runningLock.Lock()
				running := r.running
				r.runningLock.Unlock()
				if !running {
					r.cleanup()
					r.setState(structs.TaskStateDead, r.destroyEvent, false)
					return
				}

				// Remove from consul before killing the task so that traffic
				// can be rerouted
				interpTask := interpolateServices(r.envBuilder.Build(), r.task)
				r.consul.RemoveTask(r.alloc.ID, interpTask)

				// Delay actually killing the task if configured. See #244
				if r.task.ShutdownDelay > 0 {
					r.logger.Printf("[DEBUG] client: delaying shutdown of alloc %q task %q for %q",
						r.alloc.ID, r.task.Name, r.task.ShutdownDelay)
					<-time.After(r.task.ShutdownDelay)
				}

				// Store the task event that provides context on the task
				// destroy. The Killed event is set from the alloc_runner and
				// doesn't add detail
				var killEvent *structs.TaskEvent
				if r.destroyEvent.Type != structs.TaskKilled {
					if r.destroyEvent.Type == structs.TaskKilling {
						killEvent = r.destroyEvent
					} else {
						r.setState(structs.TaskStateRunning, r.destroyEvent, false)
					}
				}

				r.killTask(killEvent)
				close(stopCollection)

				// Wait for handler to exit before calling cleanup
				<-handleWaitCh
				r.cleanup()

				r.setState(structs.TaskStateDead, nil, false)
				return
			}
		}

	RESTART:
		// shouldRestart will block if the task should restart after a delay.
		restart := r.shouldRestart()
		if !restart {
			r.cleanup()
			r.setState(structs.TaskStateDead, nil, false)
			return
		}

		// Clear the handle so a new driver will be created. The local wait
		// and stats channels are reset with it so the next start recreates
		// them.
		r.handleLock.Lock()
		r.handle = nil
		handleWaitCh = nil
		stopCollection = nil
		r.handleLock.Unlock()
	}
}
  1273  // cleanup removes Consul entries and calls Driver.Cleanup when a task is
  1274  // stopping. Errors are logged.
  1275  func (r *TaskRunner) cleanup() {
  1276  	// Remove from Consul
  1277  	interpTask := interpolateServices(r.envBuilder.Build(), r.task)
  1278  	r.consul.RemoveTask(r.alloc.ID, interpTask)
  1280  	drv, err := r.createDriver()
  1281  	if err != nil {
  1282  		r.logger.Printf("[ERR] client: error creating driver to cleanup resources: %v", err)
  1283  		return
  1284  	}
  1286  	res := r.getCreatedResources()
  1288  	ctx := driver.NewExecContext(r.taskDir, r.envBuilder.Build())
  1289  	attempts := 1
  1290  	var cleanupErr error
  1291  	for retry := true; retry; attempts++ {
  1292  		cleanupErr = drv.Cleanup(ctx, res)
  1293  		retry = structs.IsRecoverable(cleanupErr)
  1295  		// Copy current createdResources state in case SaveState is
  1296  		// called between retries
  1297  		r.setCreatedResources(res)
  1299  		// Retry 3 times with sleeps between
  1300  		if !retry || attempts > 3 {
  1301  			break
  1302  		}
  1303  		time.Sleep(time.Duration(attempts) * time.Second)
  1304  	}
  1306  	if cleanupErr != nil {
  1307  		r.logger.Printf("[ERR] client: error cleaning up resources for task %q after %d attempts: %v", r.task.Name, attempts, cleanupErr)
  1308  	}
  1309  	return
  1310  }
  1312  // shouldRestart returns if the task should restart. If the return value is
  1313  // true, the task's restart policy has already been considered and any wait time
  1314  // between restarts has been applied.
  1315  func (r *TaskRunner) shouldRestart() bool {
  1316  	state, when := r.restartTracker.GetState()
  1317  	reason := r.restartTracker.GetReason()
  1318  	switch state {
  1319  	case structs.TaskNotRestarting, structs.TaskTerminated:
  1320  		r.logger.Printf("[INFO] client: Not restarting task: %v for alloc: %v ", r.task.Name, r.alloc.ID)
  1321  		if state == structs.TaskNotRestarting {
  1322  			r.setState(structs.TaskStateDead,
  1323  				structs.NewTaskEvent(structs.TaskNotRestarting).
  1324  					SetRestartReason(reason).SetFailsTask(),
  1325  				false)
  1326  		}
  1327  		return false
  1328  	case structs.TaskRestarting:
  1329  		r.logger.Printf("[INFO] client: Restarting task %q for alloc %q in %v", r.task.Name, r.alloc.ID, when)
  1330  		r.setState(structs.TaskStatePending,
  1331  			structs.NewTaskEvent(structs.TaskRestarting).
  1332  				SetRestartDelay(when).
  1333  				SetRestartReason(reason),
  1334  			false)
  1335  	default:
  1336  		r.logger.Printf("[ERR] client: restart tracker returned unknown state: %q", state)
  1337  		return false
  1338  	}
  1340  	// Unregister from Consul while waiting to restart.
  1341  	interpTask := interpolateServices(r.envBuilder.Build(), r.task)
  1342  	r.consul.RemoveTask(r.alloc.ID, interpTask)
  1344  	// Sleep but watch for destroy events.
  1345  	select {
  1346  	case <-time.After(when):
  1347  	case <-r.destroyCh:
  1348  	}
  1350  	// Destroyed while we were waiting to restart, so abort.
  1351  	r.destroyLock.Lock()
  1352  	destroyed := r.destroy
  1353  	r.destroyLock.Unlock()
  1354  	if destroyed {
  1355  		r.logger.Printf("[DEBUG] client: Not restarting task: %v because it has been destroyed", r.task.Name)
  1356  		r.setState(structs.TaskStateDead, r.destroyEvent, false)
  1357  		return false
  1358  	}
  1360  	return true
  1361  }
  1363  // killTask kills the running task. A killing event can optionally be passed and
  1364  // this event is used to mark the task as being killed. It provides a means to
  1365  // store extra information.
  1366  func (r *TaskRunner) killTask(killingEvent *structs.TaskEvent) {
  1367  	r.runningLock.Lock()
  1368  	running := r.running
  1369  	r.runningLock.Unlock()
  1370  	if !running {
  1371  		return
  1372  	}
  1374  	// Get the kill timeout
  1375  	timeout := driver.GetKillTimeout(r.task.KillTimeout, r.config.MaxKillTimeout)
  1377  	// Build the event
  1378  	var event *structs.TaskEvent
  1379  	if killingEvent != nil {
  1380  		event = killingEvent
  1381  		event.Type = structs.TaskKilling
  1382  	} else {
  1383  		event = structs.NewTaskEvent(structs.TaskKilling)
  1384  	}
  1385  	event.SetKillTimeout(timeout)
  1387  	// Mark that we received the kill event
  1388  	r.setState(structs.TaskStateRunning, event, false)
  1390  	handle := r.getHandle()
  1392  	// Kill the task using an exponential backoff in-case of failures.
  1393  	destroySuccess, err := r.handleDestroy(handle)
  1394  	if !destroySuccess {
  1395  		// We couldn't successfully destroy the resource created.
  1396  		r.logger.Printf("[ERR] client: failed to kill task %q. Resources may have been leaked: %v", r.task.Name, err)
  1397  	}
  1399  	r.runningLock.Lock()
  1400  	r.running = false
  1401  	r.runningLock.Unlock()
  1403  	// Store that the task has been destroyed and any associated error.
  1404  	r.setState("", structs.NewTaskEvent(structs.TaskKilled).SetKillError(err), true)
  1405  }
  1407  // startTask creates the driver, task dir, and starts the task.
  1408  func (r *TaskRunner) startTask() error {
  1409  	// Create a driver
  1410  	drv, err := r.createDriver()
  1411  	if err != nil {
  1412  		return fmt.Errorf("failed to create driver of task %q for alloc %q: %v",
  1413  			r.task.Name, r.alloc.ID, err)
  1414  	}
  1416  	// Run prestart
  1417  	ctx := driver.NewExecContext(r.taskDir, r.envBuilder.Build())
  1418  	presp, err := drv.Prestart(ctx, r.task)
  1420  	// Merge newly created resources into previously created resources
  1421  	if presp != nil {
  1422  		r.createdResourcesLock.Lock()
  1423  		r.createdResources.Merge(presp.CreatedResources)
  1424  		r.createdResourcesLock.Unlock()
  1426  		// Set any network configuration returned by the driver
  1427  		r.envBuilder.SetDriverNetwork(presp.Network)
  1428  	}
  1430  	if err != nil {
  1431  		wrapped := fmt.Sprintf("failed to initialize task %q for alloc %q: %v",
  1432  			r.task.Name, r.alloc.ID, err)
  1433  		r.logger.Printf("[WARN] client: error from prestart: %s", wrapped)
  1434  		return structs.WrapRecoverable(wrapped, err)
  1435  	}
  1437  	// Create a new context for Start since the environment may have been updated.
  1438  	ctx = driver.NewExecContext(r.taskDir, r.envBuilder.Build())
  1440  	// Start the job
  1441  	sresp, err := drv.Start(ctx, r.task)
  1442  	if err != nil {
  1443  		wrapped := fmt.Sprintf("failed to start task %q for alloc %q: %v",
  1444  			r.task.Name, r.alloc.ID, err)
  1445  		r.logger.Printf("[WARN] client: %s", wrapped)
  1446  		return structs.WrapRecoverable(wrapped, err)
  1448  	}
  1450  	// Log driver network information
  1451  	if sresp.Network != nil && sresp.Network.IP != "" {
  1452  		if sresp.Network.AutoAdvertise {
  1453  			r.logger.Printf("[INFO] client: alloc %s task %s auto-advertising detected IP %s",
  1454  				r.alloc.ID, r.task.Name, sresp.Network.IP)
  1455  		} else {
  1456  			r.logger.Printf("[TRACE] client: alloc %s task %s detected IP %s but not auto-advertising",
  1457  				r.alloc.ID, r.task.Name, sresp.Network.IP)
  1458  		}
  1459  	}
  1461  	if sresp.Network == nil || sresp.Network.IP == "" {
  1462  		r.logger.Printf("[TRACE] client: alloc %s task %s could not detect a driver IP", r.alloc.ID, r.task.Name)
  1463  	}
  1465  	// Update environment with the network defined by the driver's Start method.
  1466  	r.envBuilder.SetDriverNetwork(sresp.Network)
  1468  	if err := r.registerServices(drv, sresp.Handle, sresp.Network); err != nil {
  1469  		// All IO is done asynchronously, so errors from registering
  1470  		// services are hard failures.
  1471  		r.logger.Printf("[ERR] client: failed to register services and checks for task %q alloc %q: %v", r.task.Name, r.alloc.ID, err)
  1473  		// Kill the started task
  1474  		if destroyed, err := r.handleDestroy(sresp.Handle); !destroyed {
  1475  			r.logger.Printf("[ERR] client: failed to kill task %q alloc %q. Resources may be leaked: %v",
  1476  				r.task.Name, r.alloc.ID, err)
  1477  		}
  1478  		return structs.NewRecoverableError(err, false)
  1479  	}
  1481  	r.handleLock.Lock()
  1482  	r.handle = sresp.Handle
  1483  	r.handleLock.Unlock()
  1485  	// Need to persist the driver network between restarts
  1486  	r.driverNetLock.Lock()
  1487  	r.driverNet = sresp.Network
  1488  	r.driverNetLock.Unlock()
  1490  	return nil
  1491  }
  1493  // registerServices and checks with Consul.
  1494  func (r *TaskRunner) registerServices(d driver.Driver, h driver.DriverHandle, n *cstructs.DriverNetwork) error {
  1495  	var exec driver.ScriptExecutor
  1496  	if d.Abilities().Exec {
  1497  		// Allow set the script executor if the driver supports it
  1498  		exec = h
  1499  	}
  1500  	interpolatedTask := interpolateServices(r.envBuilder.Build(), r.task)
  1501  	return r.consul.RegisterTask(r.alloc.ID, interpolatedTask, r, exec, n)
  1502  }
  1504  // interpolateServices interpolates tags in a service and checks with values from the
  1505  // task's environment.
  1506  func interpolateServices(taskEnv *env.TaskEnv, task *structs.Task) *structs.Task {
  1507  	taskCopy := task.Copy()
  1508  	for _, service := range taskCopy.Services {
  1509  		for _, check := range service.Checks {
  1510  			check.Name = taskEnv.ReplaceEnv(check.Name)
  1511  			check.Type = taskEnv.ReplaceEnv(check.Type)
  1512  			check.Command = taskEnv.ReplaceEnv(check.Command)
  1513  			check.Args = taskEnv.ParseAndReplace(check.Args)
  1514  			check.Path = taskEnv.ReplaceEnv(check.Path)
  1515  			check.Protocol = taskEnv.ReplaceEnv(check.Protocol)
  1516  			check.PortLabel = taskEnv.ReplaceEnv(check.PortLabel)
  1517  			check.InitialStatus = taskEnv.ReplaceEnv(check.InitialStatus)
  1518  			check.Method = taskEnv.ReplaceEnv(check.Method)
  1519  			if len(check.Header) > 0 {
  1520  				header := make(map[string][]string, len(check.Header))
  1521  				for k, vs := range check.Header {
  1522  					newVals := make([]string, len(vs))
  1523  					for i, v := range vs {
  1524  						newVals[i] = taskEnv.ReplaceEnv(v)
  1525  					}
  1526  					header[taskEnv.ReplaceEnv(k)] = newVals
  1527  				}
  1528  				check.Header = header
  1529  			}
  1530  		}
  1531  		service.Name = taskEnv.ReplaceEnv(service.Name)
  1532  		service.PortLabel = taskEnv.ReplaceEnv(service.PortLabel)
  1533  		service.Tags = taskEnv.ParseAndReplace(service.Tags)
  1534  	}
  1535  	return taskCopy
  1536  }
  1538  // buildTaskDir creates the task directory before driver.Prestart. It is safe
  1539  // to call multiple times as its state is persisted.
  1540  func (r *TaskRunner) buildTaskDir(fsi cstructs.FSIsolation) error {
  1541  	r.persistLock.Lock()
  1542  	built := r.taskDirBuilt
  1543  	r.persistLock.Unlock()
  1545  	// We do not set the state again since this only occurs during restoration
  1546  	// and the task dir is already built. The reason we call Build again is to
  1547  	// ensure that the task dir invariants are still held.
  1548  	if !built {
  1549  		r.setState(structs.TaskStatePending,
  1550  			structs.NewTaskEvent(structs.TaskSetup).SetMessage(structs.TaskBuildingTaskDir),
  1551  			false)
  1552  	}
  1554  	chroot := config.DefaultChrootEnv
  1555  	if len(r.config.ChrootEnv) > 0 {
  1556  		chroot = r.config.ChrootEnv
  1557  	}
  1558  	if err := r.taskDir.Build(built, chroot, fsi); err != nil {
  1559  		return err
  1560  	}
  1562  	// Mark task dir as successfully built
  1563  	r.persistLock.Lock()
  1564  	r.taskDirBuilt = true
  1565  	r.persistLock.Unlock()
  1567  	// Set path and host related env vars
  1568  	driver.SetEnvvars(r.envBuilder, fsi, r.taskDir, r.config)
  1569  	return nil
  1570  }
  1572  // collectResourceUsageStats starts collecting resource usage stats of a Task.
  1573  // Collection ends when the passed channel is closed
  1574  func (r *TaskRunner) collectResourceUsageStats(stopCollection <-chan struct{}) {
  1575  	// start collecting the stats right away and then start collecting every
  1576  	// collection interval
  1577  	next := time.NewTimer(0)
  1578  	defer next.Stop()
  1579  	for {
  1580  		select {
  1581  		case <-next.C:
  1582  			next.Reset(r.config.StatsCollectionInterval)
  1583  			handle := r.getHandle()
  1584  			if handle == nil {
  1585  				continue
  1586  			}
  1587  			ru, err := handle.Stats()
  1589  			if err != nil {
  1590  				// Check if the driver doesn't implement stats
  1591  				if err.Error() == driver.DriverStatsNotImplemented.Error() {
  1592  					r.logger.Printf("[DEBUG] client: driver for task %q in allocation %q doesn't support stats", r.task.Name, r.alloc.ID)
  1593  					return
  1594  				}
  1596  				// We do not log when the plugin is shutdown as this is simply a
  1597  				// race between the stopCollection channel being closed and calling
  1598  				// Stats on the handle.
  1599  				if !strings.Contains(err.Error(), "connection is shut down") {
  1600  					r.logger.Printf("[DEBUG] client: error fetching stats of task %v: %v", r.task.Name, err)
  1601  				}
  1602  				continue
  1603  			}
  1605  			r.resourceUsageLock.Lock()
  1606  			r.resourceUsage = ru
  1607  			r.resourceUsageLock.Unlock()
  1608  			if ru != nil {
  1609  				r.emitStats(ru)
  1610  			}
  1611  		case <-stopCollection:
  1612  			return
  1613  		}
  1614  	}
  1615  }
  1617  // LatestResourceUsage returns the last resource utilization datapoint collected
  1618  func (r *TaskRunner) LatestResourceUsage() *cstructs.TaskResourceUsage {
  1619  	r.resourceUsageLock.RLock()
  1620  	defer r.resourceUsageLock.RUnlock()
  1621  	r.runningLock.Lock()
  1622  	defer r.runningLock.Unlock()
  1624  	// If the task is not running there can be no latest resource
  1625  	if !r.running {
  1626  		return nil
  1627  	}
  1629  	return r.resourceUsage
  1630  }
  1632  // handleUpdate takes an updated allocation and updates internal state to
  1633  // reflect the new config for the task.
  1634  func (r *TaskRunner) handleUpdate(update *structs.Allocation) error {
  1635  	// Extract the task group from the alloc.
  1636  	tg := update.Job.LookupTaskGroup(update.TaskGroup)
  1637  	if tg == nil {
  1638  		return fmt.Errorf("alloc '%s' missing task group '%s'", update.ID, update.TaskGroup)
  1639  	}
  1641  	// Extract the task.
  1642  	var updatedTask *structs.Task
  1643  	for _, t := range tg.Tasks {
  1644  		if t.Name == r.task.Name {
  1645  			updatedTask = t.Copy()
  1646  			break
  1647  		}
  1648  	}
  1649  	if updatedTask == nil {
  1650  		return fmt.Errorf("task group %q doesn't contain task %q", tg.Name, r.task.Name)
  1651  	}
  1653  	// Merge in the task resources
  1654  	updatedTask.Resources = update.TaskResources[updatedTask.Name]
  1656  	// Interpolate the old task with the old env before updating the env as
  1657  	// updating services in Consul need both the old and new interpolations
  1658  	// to find differences.
  1659  	oldInterpolatedTask := interpolateServices(r.envBuilder.Build(), r.task)
  1661  	// Now it's safe to update the environment
  1662  	r.envBuilder.UpdateTask(update, updatedTask)
  1664  	var mErr multierror.Error
  1665  	r.handleLock.Lock()
  1666  	if r.handle != nil {
  1667  		drv, err := r.createDriver()
  1668  		if err != nil {
  1669  			// Something has really gone wrong; don't continue
  1670  			r.handleLock.Unlock()
  1671  			return fmt.Errorf("error accessing driver when updating task %q: %v", r.task.Name, err)
  1672  		}
  1674  		// Update will update resources and store the new kill timeout.
  1675  		if err := r.handle.Update(updatedTask); err != nil {
  1676  			mErr.Errors = append(mErr.Errors, fmt.Errorf("updating task resources failed: %v", err))
  1677  		}
  1679  		// Update services in Consul
  1680  		newInterpolatedTask := interpolateServices(r.envBuilder.Build(), updatedTask)
  1681  		if err := r.updateServices(drv, r.handle, oldInterpolatedTask, newInterpolatedTask); err != nil {
  1682  			mErr.Errors = append(mErr.Errors, fmt.Errorf("error updating services and checks in Consul: %v", err))
  1683  		}
  1684  	}
  1685  	r.handleLock.Unlock()
  1687  	// Update the restart policy.
  1688  	if r.restartTracker != nil {
  1689  		r.restartTracker.SetPolicy(tg.RestartPolicy)
  1690  	}
  1692  	// Store the updated alloc.
  1693  	r.alloc = update
  1694  	r.task = updatedTask
  1695  	return mErr.ErrorOrNil()
  1696  }
  1698  // updateServices and checks with Consul. Tasks must be interpolated!
  1699  func (r *TaskRunner) updateServices(d driver.Driver, h driver.ScriptExecutor, oldTask, newTask *structs.Task) error {
  1700  	var exec driver.ScriptExecutor
  1701  	if d.Abilities().Exec {
  1702  		// Allow set the script executor if the driver supports it
  1703  		exec = h
  1704  	}
  1705  	r.driverNetLock.Lock()
  1706  	net := r.driverNet.Copy()
  1707  	r.driverNetLock.Unlock()
  1708  	return r.consul.UpdateTask(r.alloc.ID, oldTask, newTask, r, exec, net)
  1709  }
  1711  // handleDestroy kills the task handle. In the case that killing fails,
  1712  // handleDestroy will retry with an exponential backoff and will give up at a
  1713  // given limit. It returns whether the task was destroyed and the error
  1714  // associated with the last kill attempt.
  1715  func (r *TaskRunner) handleDestroy(handle driver.DriverHandle) (destroyed bool, err error) {
  1716  	// Cap the number of times we attempt to kill the task.
  1717  	for i := 0; i < killFailureLimit; i++ {
  1718  		if err = handle.Kill(); err != nil {
  1719  			// Calculate the new backoff
  1720  			backoff := (1 << (2 * uint64(i))) * killBackoffBaseline
  1721  			if backoff > killBackoffLimit {
  1722  				backoff = killBackoffLimit
  1723  			}
  1725  			r.logger.Printf("[ERR] client: failed to kill task '%s' for alloc %q. Retrying in %v: %v",
  1726  				r.task.Name, r.alloc.ID, backoff, err)
  1727  			time.Sleep(backoff)
  1728  		} else {
  1729  			// Kill was successful
  1730  			return true, nil
  1731  		}
  1732  	}
  1733  	return
  1734  }
  1736  // Restart will restart the task.
  1737  func (r *TaskRunner) Restart(source, reason string, failure bool) {
  1738  	reasonStr := fmt.Sprintf("%s: %s", source, reason)
  1739  	event := newTaskRestartEvent(reasonStr, failure)
  1741  	select {
  1742  	case r.restartCh <- event:
  1743  	case <-r.waitCh:
  1744  	}
  1745  }
  1747  // Signal will send a signal to the task
  1748  func (r *TaskRunner) Signal(source, reason string, s os.Signal) error {
  1750  	reasonStr := fmt.Sprintf("%s: %s", source, reason)
  1751  	event := structs.NewTaskEvent(structs.TaskSignaling).SetTaskSignal(s).SetTaskSignalReason(reasonStr)
  1753  	resCh := make(chan error)
  1754  	se := SignalEvent{
  1755  		s:      s,
  1756  		e:      event,
  1757  		result: resCh,
  1758  	}
  1760  	select {
  1761  	case r.signalCh <- se:
  1762  	case <-r.waitCh:
  1763  	}
  1765  	return <-resCh
  1766  }
  1768  // Kill will kill a task and store the error, no longer restarting the task. If
  1769  // fail is set, the task is marked as having failed.
  1770  func (r *TaskRunner) Kill(source, reason string, fail bool) {
  1771  	reasonStr := fmt.Sprintf("%s: %s", source, reason)
  1772  	event := structs.NewTaskEvent(structs.TaskKilling).SetKillReason(reasonStr)
  1773  	if fail {
  1774  		event.SetFailsTask()
  1775  	}
  1777  	r.logger.Printf("[DEBUG] client: killing task %v for alloc %q: %v", r.task.Name, r.alloc.ID, reasonStr)
  1778  	r.Destroy(event)
  1779  }
  1781  func (r *TaskRunner) EmitEvent(source, message string) {
  1782  	event := structs.NewTaskEvent(source).
  1783  		SetMessage(message)
  1784  	r.setState("", event, false)
  1785  	r.logger.Printf("[DEBUG] client: event from %q for task %q in alloc %q: %v",
  1786  		source, r.task.Name, r.alloc.ID, message)
  1787  }
  1789  // UnblockStart unblocks the starting of the task. It currently assumes only
  1790  // consul-template will unblock
  1791  func (r *TaskRunner) UnblockStart(source string) {
  1792  	r.unblockLock.Lock()
  1793  	defer r.unblockLock.Unlock()
  1794  	if r.unblocked {
  1795  		return
  1796  	}
  1798  	r.logger.Printf("[DEBUG] client: unblocking task %v for alloc %q: %v", r.task.Name, r.alloc.ID, source)
  1799  	r.unblocked = true
  1800  	close(r.unblockCh)
  1801  }
  1803  // Helper function for converting a WaitResult into a TaskTerminated event.
  1804  func (r *TaskRunner) waitErrorToEvent(res *dstructs.WaitResult) *structs.TaskEvent {
  1805  	return structs.NewTaskEvent(structs.TaskTerminated).
  1806  		SetExitCode(res.ExitCode).
  1807  		SetSignal(res.Signal).
  1808  		SetExitMessage(res.Err)
  1809  }
  1811  // Update is used to update the task of the context
  1812  func (r *TaskRunner) Update(update *structs.Allocation) {
  1813  	select {
  1814  	case r.updateCh <- update:
  1815  	default:
  1816  		r.logger.Printf("[ERR] client: dropping task update '%s' (alloc '%s')",
  1817  			r.task.Name, r.alloc.ID)
  1818  	}
  1819  }
  1821  // Destroy is used to indicate that the task context should be destroyed. The
  1822  // event parameter provides a context for the destroy.
  1823  func (r *TaskRunner) Destroy(event *structs.TaskEvent) {
  1824  	r.destroyLock.Lock()
  1825  	defer r.destroyLock.Unlock()
  1827  	if r.destroy {
  1828  		return
  1829  	}
  1830  	r.destroy = true
  1831  	r.destroyEvent = event
  1832  	close(r.destroyCh)
  1833  }
  1835  // getCreatedResources returns the resources created by drivers. It will never
  1836  // return nil.
  1837  func (r *TaskRunner) getCreatedResources() *driver.CreatedResources {
  1838  	r.createdResourcesLock.Lock()
  1839  	if r.createdResources == nil {
  1840  		r.createdResources = driver.NewCreatedResources()
  1841  	}
  1842  	cr := r.createdResources.Copy()
  1843  	r.createdResourcesLock.Unlock()
  1845  	return cr
  1846  }
  1848  // setCreatedResources updates the resources created by drivers. If passed nil
  1849  // it will set createdResources to an initialized struct.
  1850  func (r *TaskRunner) setCreatedResources(cr *driver.CreatedResources) {
  1851  	if cr == nil {
  1852  		cr = driver.NewCreatedResources()
  1853  	}
  1854  	r.createdResourcesLock.Lock()
  1855  	r.createdResources = cr.Copy()
  1856  	r.createdResourcesLock.Unlock()
  1857  }
  1859  func (r *TaskRunner) setGaugeForMemory(ru *cstructs.TaskResourceUsage) {
  1860  	if !r.config.DisableTaggedMetrics {
  1861  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "rss"},
  1862  			float32(ru.ResourceUsage.MemoryStats.RSS), r.baseLabels)
  1863  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "rss"},
  1864  			float32(ru.ResourceUsage.MemoryStats.RSS), r.baseLabels)
  1865  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "cache"},
  1866  			float32(ru.ResourceUsage.MemoryStats.Cache), r.baseLabels)
  1867  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "swap"},
  1868  			float32(ru.ResourceUsage.MemoryStats.Swap), r.baseLabels)
  1869  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "max_usage"},
  1870  			float32(ru.ResourceUsage.MemoryStats.MaxUsage), r.baseLabels)
  1871  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "kernel_usage"},
  1872  			float32(ru.ResourceUsage.MemoryStats.KernelUsage), r.baseLabels)
  1873  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "kernel_max_usage"},
  1874  			float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage), r.baseLabels)
  1875  	}
  1877  	if r.config.BackwardsCompatibleMetrics {
  1878  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "rss"}, float32(ru.ResourceUsage.MemoryStats.RSS))
  1879  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "cache"}, float32(ru.ResourceUsage.MemoryStats.Cache))
  1880  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "swap"}, float32(ru.ResourceUsage.MemoryStats.Swap))
  1881  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "max_usage"}, float32(ru.ResourceUsage.MemoryStats.MaxUsage))
  1882  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "kernel_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelUsage))
  1883  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "kernel_max_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage))
  1884  	}
  1885  }
  1887  func (r *TaskRunner) setGaugeForCPU(ru *cstructs.TaskResourceUsage) {
  1888  	if !r.config.DisableTaggedMetrics {
  1889  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "total_percent"},
  1890  			float32(ru.ResourceUsage.CpuStats.Percent), r.baseLabels)
  1891  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "system"},
  1892  			float32(ru.ResourceUsage.CpuStats.SystemMode), r.baseLabels)
  1893  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "user"},
  1894  			float32(ru.ResourceUsage.CpuStats.UserMode), r.baseLabels)
  1895  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "throttled_time"},
  1896  			float32(ru.ResourceUsage.CpuStats.ThrottledTime), r.baseLabels)
  1897  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "throttled_periods"},
  1898  			float32(ru.ResourceUsage.CpuStats.ThrottledPeriods), r.baseLabels)
  1899  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "total_ticks"},
  1900  			float32(ru.ResourceUsage.CpuStats.TotalTicks), r.baseLabels)
  1901  	}
  1903  	if r.config.BackwardsCompatibleMetrics {
  1904  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_percent"}, float32(ru.ResourceUsage.CpuStats.Percent))
  1905  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "system"}, float32(ru.ResourceUsage.CpuStats.SystemMode))
  1906  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "user"}, float32(ru.ResourceUsage.CpuStats.UserMode))
  1907  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "throttled_time"}, float32(ru.ResourceUsage.CpuStats.ThrottledTime))
  1908  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "throttled_periods"}, float32(ru.ResourceUsage.CpuStats.ThrottledPeriods))
  1909  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_ticks"}, float32(ru.ResourceUsage.CpuStats.TotalTicks))
  1910  	}
  1911  }
  1913  // emitStats emits resource usage stats of tasks to remote metrics collector
  1914  // sinks
  1915  func (r *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) {
  1916  	if !r.config.PublishAllocationMetrics {
  1917  		return
  1918  	}
  1920  	// If the task is not running don't emit anything
  1921  	r.runningLock.Lock()
  1922  	running := r.running
  1923  	r.runningLock.Unlock()
  1924  	if !running {
  1925  		return
  1926  	}
  1928  	if ru.ResourceUsage.MemoryStats != nil {
  1929  		r.setGaugeForMemory(ru)
  1930  	}
  1932  	if ru.ResourceUsage.CpuStats != nil {
  1933  		r.setGaugeForCPU(ru)
  1934  	}
  1935  }