github.com/djenriquez/nomad-1@v0.8.1/client/task_runner.go

github.com/djenriquez/nomad-1@v0.8.1/client/task_runner.go (about)

     1  package client
     2  
     3  import (
     4  	"bytes"
     5  	"crypto/md5"
     6  	"encoding/hex"
     7  	"fmt"
     8  	"io"
     9  	"io/ioutil"
    10  	"log"
    11  	"os"
    12  	"path/filepath"
    13  	"strings"
    14  	"sync"
    15  	"time"
    16  
    17  	metrics "github.com/armon/go-metrics"
    18  	"github.com/boltdb/bolt"
    19  	"github.com/golang/snappy"
    20  	"github.com/hashicorp/consul-template/signals"
    21  	"github.com/hashicorp/go-multierror"
    22  	version "github.com/hashicorp/go-version"
    23  	"github.com/hashicorp/nomad/client/allocdir"
    24  	"github.com/hashicorp/nomad/client/config"
    25  	"github.com/hashicorp/nomad/client/driver"
    26  	"github.com/hashicorp/nomad/client/getter"
    27  	"github.com/hashicorp/nomad/client/vaultclient"
    28  	"github.com/hashicorp/nomad/nomad/structs"
    29  	"github.com/ugorji/go/codec"
    30  
    31  	"github.com/hashicorp/nomad/client/driver/env"
    32  	dstructs "github.com/hashicorp/nomad/client/driver/structs"
    33  	cstructs "github.com/hashicorp/nomad/client/structs"
    34  )
    35  
// Tunables for killing tasks and for retrieving Vault tokens.
const (
	// killBackoffBaseline is the baseline time for exponential backoff while
	// killing a task.
	killBackoffBaseline = 5 * time.Second

	// killBackoffLimit is the limit of the exponential backoff for killing
	// the task.
	killBackoffLimit = 2 * time.Minute

	// killFailureLimit is how many times we will attempt to kill a task before
	// giving up and potentially leaking resources.
	killFailureLimit = 5

	// vaultBackoffBaseline is the baseline time for exponential backoff when
	// attempting to retrieve a Vault token.
	vaultBackoffBaseline = 5 * time.Second

	// vaultBackoffLimit is the limit of the exponential backoff when attempting
	// to retrieve a Vault token.
	vaultBackoffLimit = 3 * time.Minute

	// vaultTokenFile is the name of the file holding the Vault token inside the
	// task's secret directory.
	vaultTokenFile = "vault_token"
)
    61  
var (
	// taskRunnerStateAllKey is the key under which the whole of a task
	// runner's state is stored in its task bucket. At the moment there is no
	// need to split the state across several keys.
	taskRunnerStateAllKey = []byte("simple-all")
)
    67  
// taskRestartEvent wraps a TaskEvent with additional metadata to control
// restart behavior. Values of this type are what is sent over
// TaskRunner.restartCh.
type taskRestartEvent struct {
	// taskEvent is the event to report for the restart.
	taskEvent *structs.TaskEvent

	// failure indicates whether the restart counts against the restart
	// count; if false, it does not.
	failure bool
}
    77  
    78  func newTaskRestartEvent(reason string, failure bool) *taskRestartEvent {
    79  	return &taskRestartEvent{
    80  		taskEvent: structs.NewTaskEvent(structs.TaskRestartSignal).SetRestartReason(reason),
    81  		failure:   failure,
    82  	}
    83  }
    84  
// TaskRunner is used to wrap a task within an allocation and provide the
// execution context. It is created with NewTaskRunner, optionally restored
// with RestoreState, and driven by Run.
type TaskRunner struct {
	// stateDB is the bolt database the runner's snapshot is read from and
	// written to (see RestoreState, SaveState and DestroyState).
	stateDB *bolt.DB
	// config is the client configuration handed to NewTaskRunner.
	config *config.Config
	// updater is invoked to report task state changes and events.
	updater TaskStateUpdater
	logger  *log.Logger
	// restartTracker is built in NewTaskRunner from the task group's restart
	// policy and the job type.
	restartTracker *RestartTracker
	// consul is the Consul service API handed to NewTaskRunner.
	consul ConsulServiceAPI

	// running marks whether the task is running
	running     bool
	runningLock sync.Mutex

	resourceUsage     *cstructs.TaskResourceUsage
	resourceUsageLock sync.RWMutex

	// alloc, task and taskDir identify the allocation, the task within it and
	// the task's directory.
	alloc   *structs.Allocation
	task    *structs.Task
	taskDir *allocdir.TaskDir

	// envBuilder is used to build the task's environment
	envBuilder *env.Builder

	// driverNet is the network information returned by the driver
	driverNet     *cstructs.DriverNetwork
	driverNetLock sync.Mutex

	// updateCh is used to receive updated versions of the allocation
	updateCh chan *structs.Allocation

	// handle is the driver handle of the task, or nil; read it through
	// getHandle or while holding handleLock.
	handle     driver.DriverHandle
	handleLock sync.Mutex

	// artifactsDownloaded tracks whether the tasks artifacts have been
	// downloaded
	//
	// Must acquire persistLock when accessing
	artifactsDownloaded bool

	// taskDirBuilt tracks whether the task has built its directory.
	//
	// Must acquire persistLock when accessing
	taskDirBuilt bool

	// createdResources are all the resources created by the task driver
	// across all attempts to start the task.
	// Simple gets and sets should use {get,set}CreatedResources
	createdResources     *driver.CreatedResources
	createdResourcesLock sync.Mutex

	// payloadRendered tracks whether the payload has been rendered to disk
	payloadRendered bool

	// vaultFuture is the means to wait for and get a Vault token
	vaultFuture *tokenFuture

	// recoveredVaultToken is the token that was recovered through a restore
	recoveredVaultToken string

	// vaultClient is used to retrieve and renew any needed Vault token
	vaultClient vaultclient.VaultClient

	// templateManager is used to manage any consul-templates this task may have
	templateManager *TaskTemplateManager

	// startCh is used to trigger the start of the task
	startCh chan struct{}

	// unblockCh is used to unblock the starting of the task
	unblockCh   chan struct{}
	unblocked   bool
	unblockLock sync.Mutex

	// restartCh is used to restart a task
	restartCh chan *taskRestartEvent

	// signalCh is used to send a signal to a task
	signalCh chan SignalEvent

	// destroy records that the runner has been destroyed; SaveState becomes
	// a no-op once it is set. It is read under destroyLock.
	destroy      bool
	destroyCh    chan struct{}
	destroyLock  sync.Mutex
	destroyEvent *structs.TaskEvent

	// waitCh closing marks the run loop as having exited
	waitCh chan struct{}

	// persistLock must be acquired when accessing fields stored by
	// SaveState. SaveState is called asynchronously to TaskRunner.Run by
	// AllocRunner, so all state fields must be synchronized using this
	// lock.
	persistLock sync.Mutex

	// persistedHash is the hash of the last persisted snapshot. It is used to
	// detect if a new snapshot has to be written to disk.
	persistedHash []byte

	// baseLabels are used when emitting tagged metrics. All task runner metrics
	// will have these tags, and optionally more.
	baseLabels []metrics.Label
}
   186  
// taskRunnerState is used to snapshot the state of the task runner. SaveState
// serializes it into the state database and RestoreState reads it back.
type taskRunnerState struct {
	// Version is the client version number that wrote the snapshot.
	Version string
	// HandleID is the ID of the driver handle; empty when there was no handle.
	HandleID           string
	ArtifactDownloaded bool
	TaskDirBuilt       bool
	PayloadRendered    bool
	CreatedResources   *driver.CreatedResources
	DriverNetwork      *cstructs.DriverNetwork
}
   197  
   198  func (s *taskRunnerState) Hash() []byte {
   199  	h := md5.New()
   200  
   201  	io.WriteString(h, s.Version)
   202  	io.WriteString(h, s.HandleID)
   203  	io.WriteString(h, fmt.Sprintf("%v", s.ArtifactDownloaded))
   204  	io.WriteString(h, fmt.Sprintf("%v", s.TaskDirBuilt))
   205  	io.WriteString(h, fmt.Sprintf("%v", s.PayloadRendered))
   206  	h.Write(s.CreatedResources.Hash())
   207  	h.Write(s.DriverNetwork.Hash())
   208  
   209  	return h.Sum(nil)
   210  }
   211  
// TaskStateUpdater is used to signal that a task's state has changed. It
// receives the task name, the new state and the event that caused the change.
// If lazySync is set the event won't be immediately pushed to the server.
type TaskStateUpdater func(taskName, state string, event *structs.TaskEvent, lazySync bool)
   215  
// SignalEvent is a tuple of the signal and the event generating it, together
// with a channel on which the outcome of delivering the signal is reported.
type SignalEvent struct {
	// s is the signal to be sent
	s os.Signal

	// e is the task event generating the signal
	e *structs.TaskEvent

	// result should be used to send back the result of the signal
	result chan<- error
}
   227  
   228  // NewTaskRunner is used to create a new task context
   229  func NewTaskRunner(logger *log.Logger, config *config.Config,
   230  	stateDB *bolt.DB, updater TaskStateUpdater, taskDir *allocdir.TaskDir,
   231  	alloc *structs.Allocation, task *structs.Task,
   232  	vaultClient vaultclient.VaultClient, consulClient ConsulServiceAPI) *TaskRunner {
   233  
   234  	// Merge in the task resources
   235  	task.Resources = alloc.TaskResources[task.Name]
   236  
   237  	// Build the restart tracker.
   238  	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
   239  	if tg == nil {
   240  		logger.Printf("[ERR] client: alloc %q for missing task group %q", alloc.ID, alloc.TaskGroup)
   241  		return nil
   242  	}
   243  	restartTracker := newRestartTracker(tg.RestartPolicy, alloc.Job.Type)
   244  
   245  	// Initialize the environment builder
   246  	envBuilder := env.NewBuilder(config.Node, alloc, task, config.Region)
   247  
   248  	tc := &TaskRunner{
   249  		config:           config,
   250  		stateDB:          stateDB,
   251  		updater:          updater,
   252  		logger:           logger,
   253  		restartTracker:   restartTracker,
   254  		alloc:            alloc,
   255  		task:             task,
   256  		taskDir:          taskDir,
   257  		envBuilder:       envBuilder,
   258  		createdResources: driver.NewCreatedResources(),
   259  		consul:           consulClient,
   260  		vaultClient:      vaultClient,
   261  		vaultFuture:      NewTokenFuture().Set(""),
   262  		updateCh:         make(chan *structs.Allocation, 64),
   263  		destroyCh:        make(chan struct{}),
   264  		waitCh:           make(chan struct{}),
   265  		startCh:          make(chan struct{}, 1),
   266  		unblockCh:        make(chan struct{}),
   267  		restartCh:        make(chan *taskRestartEvent),
   268  		signalCh:         make(chan SignalEvent),
   269  	}
   270  
   271  	tc.baseLabels = []metrics.Label{
   272  		{
   273  			Name:  "job",
   274  			Value: tc.alloc.Job.Name,
   275  		},
   276  		{
   277  			Name:  "task_group",
   278  			Value: tc.alloc.TaskGroup,
   279  		},
   280  		{
   281  			Name:  "alloc_id",
   282  			Value: tc.alloc.ID,
   283  		},
   284  		{
   285  			Name:  "task",
   286  			Value: tc.task.Name,
   287  		},
   288  	}
   289  
   290  	return tc
   291  }
   292  
   293  // MarkReceived marks the task as received.
   294  func (r *TaskRunner) MarkReceived() {
   295  	// We lazy sync this since there will be a follow up message almost
   296  	// immediately.
   297  	r.updater(r.task.Name, structs.TaskStatePending, structs.NewTaskEvent(structs.TaskReceived), true)
   298  }
   299  
   300  // WaitCh returns a channel to wait for termination
   301  func (r *TaskRunner) WaitCh() <-chan struct{} {
   302  	return r.waitCh
   303  }
   304  
   305  // getHandle returns the task's handle or nil
   306  func (r *TaskRunner) getHandle() driver.DriverHandle {
   307  	r.handleLock.Lock()
   308  	h := r.handle
   309  	r.handleLock.Unlock()
   310  	return h
   311  }
   312  
   313  // pre060StateFilePath returns the path to our state file that would have been
   314  // written pre v0.6.0
   315  // COMPAT: Remove in 0.7.0
   316  func (r *TaskRunner) pre060StateFilePath() string {
   317  	// Get the MD5 of the task name
   318  	hashVal := md5.Sum([]byte(r.task.Name))
   319  	hashHex := hex.EncodeToString(hashVal[:])
   320  	dirName := fmt.Sprintf("task-%s", hashHex)
   321  
   322  	// Generate the path
   323  	return filepath.Join(r.config.StateDir, "alloc", r.alloc.ID, dirName, "state.json")
   324  }
   325  
   326  // RestoreState is used to restore our state. If a non-empty string is returned
   327  // the task is restarted with the string as the reason. This is useful for
   328  // backwards incompatible upgrades that need to restart tasks with a new
   329  // executor.
   330  func (r *TaskRunner) RestoreState() (string, error) {
   331  	// COMPAT: Remove in 0.7.0
   332  	// 0.6.0 transitioned from individual state files to a single bolt-db.
   333  	// The upgrade path is to:
   334  	// Check if old state exists
   335  	//   If so, restore from that and delete old state
   336  	// Restore using state database
   337  
   338  	var snap taskRunnerState
   339  
   340  	// Check if the old snapshot is there
   341  	oldPath := r.pre060StateFilePath()
   342  	if err := pre060RestoreState(oldPath, &snap); err == nil {
   343  		// Delete the old state
   344  		os.RemoveAll(oldPath)
   345  	} else if !os.IsNotExist(err) {
   346  		// Something corrupt in the old state file
   347  		return "", err
   348  	} else {
   349  		// We are doing a normal restore
   350  		err := r.stateDB.View(func(tx *bolt.Tx) error {
   351  			bkt, err := getTaskBucket(tx, r.alloc.ID, r.task.Name)
   352  			if err != nil {
   353  				return fmt.Errorf("failed to get task bucket: %v", err)
   354  			}
   355  
   356  			if err := getObject(bkt, taskRunnerStateAllKey, &snap); err != nil {
   357  				return fmt.Errorf("failed to read task runner state: %v", err)
   358  			}
   359  			return nil
   360  		})
   361  		if err != nil {
   362  			return "", err
   363  		}
   364  
   365  	}
   366  
   367  	// Restore fields from the snapshot
   368  	r.artifactsDownloaded = snap.ArtifactDownloaded
   369  	r.taskDirBuilt = snap.TaskDirBuilt
   370  	r.payloadRendered = snap.PayloadRendered
   371  	r.setCreatedResources(snap.CreatedResources)
   372  	r.driverNet = snap.DriverNetwork
   373  
   374  	if r.task.Vault != nil {
   375  		// Read the token from the secret directory
   376  		tokenPath := filepath.Join(r.taskDir.SecretsDir, vaultTokenFile)
   377  		data, err := ioutil.ReadFile(tokenPath)
   378  		if err != nil {
   379  			if !os.IsNotExist(err) {
   380  				return "", fmt.Errorf("failed to read token for task %q in alloc %q: %v", r.task.Name, r.alloc.ID, err)
   381  			}
   382  
   383  			// Token file doesn't exist
   384  		} else {
   385  			// Store the recovered token
   386  			r.recoveredVaultToken = string(data)
   387  		}
   388  	}
   389  
   390  	// Restore the driver
   391  	restartReason := ""
   392  	if snap.HandleID != "" {
   393  		d, err := r.createDriver()
   394  		if err != nil {
   395  			return "", err
   396  		}
   397  
   398  		// Add the restored network driver to the environment
   399  		r.envBuilder.SetDriverNetwork(r.driverNet)
   400  
   401  		// Open a connection to the driver handle
   402  		ctx := driver.NewExecContext(r.taskDir, r.envBuilder.Build())
   403  		handle, err := d.Open(ctx, snap.HandleID)
   404  
   405  		// In the case it fails, we relaunch the task in the Run() method.
   406  		if err != nil {
   407  			r.logger.Printf("[ERR] client: failed to open handle to task %q for alloc %q: %v",
   408  				r.task.Name, r.alloc.ID, err)
   409  			return "", nil
   410  		}
   411  
   412  		if pre06ScriptCheck(snap.Version, r.task.Driver, r.task.Services) {
   413  			restartReason = pre06ScriptCheckReason
   414  		}
   415  
   416  		if err := r.registerServices(d, handle, r.driverNet); err != nil {
   417  			// Don't hard fail here as there's a chance this task
   418  			// registered with Consul properly when it initial
   419  			// started.
   420  			r.logger.Printf("[WARN] client: failed to register services and checks with consul for task %q in alloc %q: %v",
   421  				r.task.Name, r.alloc.ID, err)
   422  		}
   423  
   424  		r.handleLock.Lock()
   425  		r.handle = handle
   426  		r.handleLock.Unlock()
   427  
   428  		r.runningLock.Lock()
   429  		r.running = true
   430  		r.runningLock.Unlock()
   431  	}
   432  	return restartReason, nil
   433  }
   434  
// ver06 is the version ("0.6.0dev") that pre06ScriptCheck compares snapshot
// versions against when checking for pre-0.6 script checks.
var ver06 = version.Must(version.NewVersion("0.6.0dev"))
   437  
// pre06ScriptCheckReason is the restart reason given when a pre-0.6 script
// check is found on an exec/java task. RestoreState returns it when
// pre06ScriptCheck reports true.
const pre06ScriptCheckReason = "upgrading pre-0.6 script checks"
   441  
   442  // pre06ScriptCheck returns true if version is prior to 0.6.0dev, has a script
   443  // check, and uses exec or java drivers.
   444  func pre06ScriptCheck(ver, driver string, services []*structs.Service) bool {
   445  	if driver != "exec" && driver != "java" && driver != "mock_driver" {
   446  		// Only exec and java are affected
   447  		return false
   448  	}
   449  	v, err := version.NewVersion(ver)
   450  	if err != nil {
   451  		// Treat it as old
   452  		return true
   453  	}
   454  	if !v.LessThan(ver06) {
   455  		// >= 0.6.0dev
   456  		return false
   457  	}
   458  	for _, service := range services {
   459  		for _, check := range service.Checks {
   460  			if check.Type == "script" {
   461  				return true
   462  			}
   463  		}
   464  	}
   465  	return false
   466  }
   467  
   468  // SaveState is used to snapshot our state
   469  func (r *TaskRunner) SaveState() error {
   470  	r.destroyLock.Lock()
   471  	defer r.destroyLock.Unlock()
   472  	if r.destroy {
   473  		// Don't save state if already destroyed
   474  		return nil
   475  	}
   476  
   477  	r.persistLock.Lock()
   478  	defer r.persistLock.Unlock()
   479  	snap := taskRunnerState{
   480  		Version:            r.config.Version.VersionNumber(),
   481  		ArtifactDownloaded: r.artifactsDownloaded,
   482  		TaskDirBuilt:       r.taskDirBuilt,
   483  		PayloadRendered:    r.payloadRendered,
   484  		CreatedResources:   r.getCreatedResources(),
   485  	}
   486  
   487  	r.handleLock.Lock()
   488  	if r.handle != nil {
   489  		snap.HandleID = r.handle.ID()
   490  	}
   491  	r.handleLock.Unlock()
   492  
   493  	r.driverNetLock.Lock()
   494  	snap.DriverNetwork = r.driverNet.Copy()
   495  	r.driverNetLock.Unlock()
   496  
   497  	// If nothing has changed avoid the write
   498  	h := snap.Hash()
   499  	if bytes.Equal(h, r.persistedHash) {
   500  		return nil
   501  	}
   502  
   503  	// Serialize the object
   504  	var buf bytes.Buffer
   505  	if err := codec.NewEncoder(&buf, structs.MsgpackHandle).Encode(&snap); err != nil {
   506  		return fmt.Errorf("failed to serialize snapshot: %v", err)
   507  	}
   508  
   509  	// Start the transaction.
   510  	return r.stateDB.Batch(func(tx *bolt.Tx) error {
   511  		// Grab the task bucket
   512  		taskBkt, err := getTaskBucket(tx, r.alloc.ID, r.task.Name)
   513  		if err != nil {
   514  			return fmt.Errorf("failed to retrieve allocation bucket: %v", err)
   515  		}
   516  
   517  		if err := putData(taskBkt, taskRunnerStateAllKey, buf.Bytes()); err != nil {
   518  			return fmt.Errorf("failed to write task_runner state: %v", err)
   519  		}
   520  
   521  		// Store the hash that was persisted
   522  		tx.OnCommit(func() {
   523  			r.persistedHash = h
   524  		})
   525  
   526  		return nil
   527  	})
   528  }
   529  
   530  // DestroyState is used to cleanup after ourselves
   531  func (r *TaskRunner) DestroyState() error {
   532  	r.persistLock.Lock()
   533  	defer r.persistLock.Unlock()
   534  
   535  	return r.stateDB.Update(func(tx *bolt.Tx) error {
   536  		if err := deleteTaskBucket(tx, r.alloc.ID, r.task.Name); err != nil {
   537  			return fmt.Errorf("failed to delete task bucket: %v", err)
   538  		}
   539  		return nil
   540  	})
   541  }
   542  
   543  // setState is used to update the state of the task runner
   544  func (r *TaskRunner) setState(state string, event *structs.TaskEvent, lazySync bool) {
   545  	event.PopulateEventDisplayMessage()
   546  
   547  	// Persist our state to disk.
   548  	if err := r.SaveState(); err != nil {
   549  		r.logger.Printf("[ERR] client: failed to save state of Task Runner for task %q: %v", r.task.Name, err)
   550  	}
   551  
   552  	// Indicate the task has been updated.
   553  	r.updater(r.task.Name, state, event, lazySync)
   554  }
   555  
   556  // createDriver makes a driver for the task
   557  func (r *TaskRunner) createDriver() (driver.Driver, error) {
   558  	// Create a task-specific event emitter callback to expose minimal
   559  	// state to drivers
   560  	eventEmitter := func(m string, args ...interface{}) {
   561  		msg := fmt.Sprintf(m, args...)
   562  		r.logger.Printf("[DEBUG] client: driver event for alloc %q: %s", r.alloc.ID, msg)
   563  		r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskDriverMessage).SetDriverMessage(msg), false)
   564  	}
   565  
   566  	driverCtx := driver.NewDriverContext(r.task.Name, r.alloc.ID, r.config, r.config.Node, r.logger, eventEmitter)
   567  	d, err := driver.NewDriver(r.task.Driver, driverCtx)
   568  	if err != nil {
   569  		return nil, fmt.Errorf("failed to create driver '%s' for alloc %s: %v",
   570  			r.task.Driver, r.alloc.ID, err)
   571  	}
   572  
   573  	return d, err
   574  }
   575  
   576  // Run is a long running routine used to manage the task
   577  func (r *TaskRunner) Run() {
   578  	defer close(r.waitCh)
   579  	r.logger.Printf("[DEBUG] client: starting task context for '%s' (alloc '%s')",
   580  		r.task.Name, r.alloc.ID)
   581  
   582  	if err := r.validateTask(); err != nil {
   583  		r.setState(
   584  			structs.TaskStateDead,
   585  			structs.NewTaskEvent(structs.TaskFailedValidation).SetValidationError(err).SetFailsTask(),
   586  			false)
   587  		return
   588  	}
   589  
   590  	// Create a temporary driver so that we can determine the FSIsolation
   591  	// required. run->startTask will create a new driver after environment
   592  	// has been setup (env vars, templates, artifacts, secrets, etc).
   593  	tmpDrv, err := r.createDriver()
   594  	if err != nil {
   595  		e := fmt.Errorf("failed to create driver of task %q for alloc %q: %v", r.task.Name, r.alloc.ID, err)
   596  		r.setState(
   597  			structs.TaskStateDead,
   598  			structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(e).SetFailsTask(),
   599  			false)
   600  		return
   601  	}
   602  
   603  	// Build base task directory structure regardless of FS isolation abilities.
   604  	// This needs to happen before we start the Vault manager and call prestart
   605  	// as both those can write to the task directories
   606  	if err := r.buildTaskDir(tmpDrv.FSIsolation()); err != nil {
   607  		e := fmt.Errorf("failed to build task directory for %q: %v", r.task.Name, err)
   608  		r.setState(
   609  			structs.TaskStateDead,
   610  			structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(e).SetFailsTask(),
   611  			false)
   612  		return
   613  	}
   614  
   615  	// If there is no Vault policy leave the static future created in
   616  	// NewTaskRunner
   617  	if r.task.Vault != nil {
   618  		// Start the go-routine to get a Vault token
   619  		r.vaultFuture.Clear()
   620  		go r.vaultManager(r.recoveredVaultToken)
   621  	}
   622  
   623  	// Start the run loop
   624  	r.run()
   625  
   626  	// Do any cleanup necessary
   627  	r.postrun()
   628  
   629  	return
   630  }
   631  
   632  // validateTask validates the fields of the task and returns an error if the
   633  // task is invalid.
   634  func (r *TaskRunner) validateTask() error {
   635  	var mErr multierror.Error
   636  
   637  	// Validate the user.
   638  	unallowedUsers := r.config.ReadStringListToMapDefault("user.blacklist", config.DefaultUserBlacklist)
   639  	checkDrivers := r.config.ReadStringListToMapDefault("user.checked_drivers", config.DefaultUserCheckedDrivers)
   640  	if _, driverMatch := checkDrivers[r.task.Driver]; driverMatch {
   641  		if _, unallowed := unallowedUsers[r.task.User]; unallowed {
   642  			mErr.Errors = append(mErr.Errors, fmt.Errorf("running as user %q is disallowed", r.task.User))
   643  		}
   644  	}
   645  
   646  	// Validate the artifacts
   647  	for i, artifact := range r.task.Artifacts {
   648  		// Verify the artifact doesn't escape the task directory.
   649  		if err := artifact.Validate(); err != nil {
   650  			// If this error occurs there is potentially a server bug or
   651  			// malicious, server spoofing.
   652  			r.logger.Printf("[ERR] client: allocation %q, task %v, artifact %#v (%v) fails validation: %v",
   653  				r.alloc.ID, r.task.Name, artifact, i, err)
   654  			mErr.Errors = append(mErr.Errors, fmt.Errorf("artifact (%d) failed validation: %v", i, err))
   655  		}
   656  	}
   657  
   658  	// Validate the Service names
   659  	taskEnv := r.envBuilder.Build()
   660  	for i, service := range r.task.Services {
   661  		name := taskEnv.ReplaceEnv(service.Name)
   662  		if err := service.ValidateName(name); err != nil {
   663  			mErr.Errors = append(mErr.Errors, fmt.Errorf("service (%d) failed validation: %v", i, err))
   664  		}
   665  	}
   666  
   667  	if len(mErr.Errors) == 1 {
   668  		return mErr.Errors[0]
   669  	}
   670  	return mErr.ErrorOrNil()
   671  }
   672  
   673  // tokenFuture stores the Vault token and allows consumers to block till a valid
   674  // token exists
   675  type tokenFuture struct {
   676  	waiting []chan struct{}
   677  	token   string
   678  	set     bool
   679  	m       sync.Mutex
   680  }
   681  
   682  // NewTokenFuture returns a new token future without any token set
   683  func NewTokenFuture() *tokenFuture {
   684  	return &tokenFuture{}
   685  }
   686  
   687  // Wait returns a channel that can be waited on. When this channel unblocks, a
   688  // valid token will be available via the Get method
   689  func (f *tokenFuture) Wait() <-chan struct{} {
   690  	f.m.Lock()
   691  	defer f.m.Unlock()
   692  
   693  	c := make(chan struct{})
   694  	if f.set {
   695  		close(c)
   696  		return c
   697  	}
   698  
   699  	f.waiting = append(f.waiting, c)
   700  	return c
   701  }
   702  
   703  // Set sets the token value and unblocks any caller of Wait
   704  func (f *tokenFuture) Set(token string) *tokenFuture {
   705  	f.m.Lock()
   706  	defer f.m.Unlock()
   707  
   708  	f.set = true
   709  	f.token = token
   710  	for _, w := range f.waiting {
   711  		close(w)
   712  	}
   713  	f.waiting = nil
   714  	return f
   715  }
   716  
   717  // Clear clears the set vault token.
   718  func (f *tokenFuture) Clear() *tokenFuture {
   719  	f.m.Lock()
   720  	defer f.m.Unlock()
   721  
   722  	f.token = ""
   723  	f.set = false
   724  	return f
   725  }
   726  
   727  // Get returns the set Vault token
   728  func (f *tokenFuture) Get() string {
   729  	f.m.Lock()
   730  	defer f.m.Unlock()
   731  	return f.token
   732  }
   733  
   734  // vaultManager should be called in a go-routine and manages the derivation,
   735  // renewal and handling of errors with the Vault token. The optional parameter
   736  // allows setting the initial Vault token. This is useful when the Vault token
   737  // is recovered off disk.
   738  func (r *TaskRunner) vaultManager(token string) {
   739  	// Helper for stopping token renewal
   740  	stopRenewal := func() {
   741  		if err := r.vaultClient.StopRenewToken(r.vaultFuture.Get()); err != nil {
   742  			r.logger.Printf("[WARN] client: failed to stop token renewal for task %v in alloc %q: %v", r.task.Name, r.alloc.ID, err)
   743  		}
   744  	}
   745  
   746  	// updatedToken lets us store state between loops. If true, a new token
   747  	// has been retrieved and we need to apply the Vault change mode
   748  	var updatedToken bool
   749  
   750  OUTER:
   751  	for {
   752  		// Check if we should exit
   753  		select {
   754  		case <-r.waitCh:
   755  			stopRenewal()
   756  			return
   757  		default:
   758  		}
   759  
   760  		// Clear the token
   761  		r.vaultFuture.Clear()
   762  
   763  		// Check if there already is a token which can be the case for
   764  		// restoring the TaskRunner
   765  		if token == "" {
   766  			// Get a token
   767  			var exit bool
   768  			token, exit = r.deriveVaultToken()
   769  			if exit {
   770  				// Exit the manager
   771  				return
   772  			}
   773  
   774  			// Write the token to disk
   775  			if err := r.writeToken(token); err != nil {
   776  				e := fmt.Errorf("failed to write Vault token to disk")
   777  				r.logger.Printf("[ERR] client: %v for task %v on alloc %q: %v", e, r.task.Name, r.alloc.ID, err)
   778  				r.Kill("vault", e.Error(), true)
   779  				return
   780  			}
   781  		}
   782  
   783  		// Start the renewal process
   784  		renewCh, err := r.vaultClient.RenewToken(token, 30)
   785  
   786  		// An error returned means the token is not being renewed
   787  		if err != nil {
   788  			r.logger.Printf("[ERR] client: failed to start renewal of Vault token for task %v on alloc %q: %v", r.task.Name, r.alloc.ID, err)
   789  			token = ""
   790  			goto OUTER
   791  		}
   792  
   793  		// The Vault token is valid now, so set it
   794  		r.vaultFuture.Set(token)
   795  
   796  		if updatedToken {
   797  			switch r.task.Vault.ChangeMode {
   798  			case structs.VaultChangeModeSignal:
   799  				s, err := signals.Parse(r.task.Vault.ChangeSignal)
   800  				if err != nil {
   801  					e := fmt.Errorf("failed to parse signal: %v", err)
   802  					r.logger.Printf("[ERR] client: %v", err)
   803  					r.Kill("vault", e.Error(), true)
   804  					return
   805  				}
   806  
   807  				if err := r.Signal("vault", "new Vault token acquired", s); err != nil {
   808  					r.logger.Printf("[ERR] client: failed to send signal to task %v for alloc %q: %v", r.task.Name, r.alloc.ID, err)
   809  					r.Kill("vault", fmt.Sprintf("failed to send signal to task: %v", err), true)
   810  					return
   811  				}
   812  			case structs.VaultChangeModeRestart:
   813  				const noFailure = false
   814  				r.Restart("vault", "new Vault token acquired", noFailure)
   815  			case structs.VaultChangeModeNoop:
   816  				fallthrough
   817  			default:
   818  				r.logger.Printf("[ERR] client: Invalid Vault change mode: %q", r.task.Vault.ChangeMode)
   819  			}
   820  
   821  			// We have handled it
   822  			updatedToken = false
   823  
   824  			// Call the handler
   825  			r.updatedTokenHandler()
   826  		}
   827  
   828  		// Start watching for renewal errors
   829  		select {
   830  		case err := <-renewCh:
   831  			// Clear the token
   832  			token = ""
   833  			r.logger.Printf("[ERR] client: failed to renew Vault token for task %v on alloc %q: %v", r.task.Name, r.alloc.ID, err)
   834  			stopRenewal()
   835  
   836  			// Check if we have to do anything
   837  			if r.task.Vault.ChangeMode != structs.VaultChangeModeNoop {
   838  				updatedToken = true
   839  			}
   840  		case <-r.waitCh:
   841  			stopRenewal()
   842  			return
   843  		}
   844  	}
   845  }
   846  
   847  // deriveVaultToken derives the Vault token using exponential backoffs. It
   848  // returns the Vault token and whether the manager should exit.
   849  func (r *TaskRunner) deriveVaultToken() (token string, exit bool) {
   850  	attempts := 0
   851  	for {
   852  		tokens, err := r.vaultClient.DeriveToken(r.alloc, []string{r.task.Name})
   853  		if err == nil {
   854  			return tokens[r.task.Name], false
   855  		}
   856  
   857  		// Check if this is a server side error
   858  		if structs.IsServerSide(err) {
   859  			r.logger.Printf("[ERR] client: failed to derive Vault token for task %v on alloc %q: %v",
   860  				r.task.Name, r.alloc.ID, err)
   861  			r.Kill("vault", fmt.Sprintf("server error deriving vault token: %v", err), true)
   862  			return "", true
   863  		}
   864  		// Check if we can't recover from the error
   865  		if !structs.IsRecoverable(err) {
   866  			r.logger.Printf("[ERR] client: failed to derive Vault token for task %v on alloc %q: %v",
   867  				r.task.Name, r.alloc.ID, err)
   868  			r.Kill("vault", fmt.Sprintf("failed to derive token: %v", err), true)
   869  			return "", true
   870  		}
   871  
   872  		// Handle the retry case
   873  		backoff := (1 << (2 * uint64(attempts))) * vaultBackoffBaseline
   874  		if backoff > vaultBackoffLimit {
   875  			backoff = vaultBackoffLimit
   876  		}
   877  		r.logger.Printf("[ERR] client: failed to derive Vault token for task %v on alloc %q: %v; retrying in %v",
   878  			r.task.Name, r.alloc.ID, err, backoff)
   879  
   880  		attempts++
   881  
   882  		// Wait till retrying
   883  		select {
   884  		case <-r.waitCh:
   885  			return "", true
   886  		case <-time.After(backoff):
   887  		}
   888  	}
   889  }
   890  
   891  // writeToken writes the given token to disk
   892  func (r *TaskRunner) writeToken(token string) error {
   893  	tokenPath := filepath.Join(r.taskDir.SecretsDir, vaultTokenFile)
   894  	if err := ioutil.WriteFile(tokenPath, []byte(token), 0777); err != nil {
   895  		return fmt.Errorf("failed to save Vault tokens to secret dir for task %q in alloc %q: %v", r.task.Name, r.alloc.ID, err)
   896  	}
   897  
   898  	return nil
   899  }
   900  
   901  // updatedTokenHandler is called when a new Vault token is retrieved. Things
   902  // that rely on the token should be updated here.
   903  func (r *TaskRunner) updatedTokenHandler() {
   904  
   905  	// Update the tasks environment
   906  	r.envBuilder.SetVaultToken(r.vaultFuture.Get(), r.task.Vault.Env)
   907  
   908  	if r.templateManager != nil {
   909  		r.templateManager.Stop()
   910  
   911  		// Create a new templateManager
   912  		var err error
   913  		r.templateManager, err = NewTaskTemplateManager(&TaskTemplateManagerConfig{
   914  			Hooks:                r,
   915  			Templates:            r.task.Templates,
   916  			ClientConfig:         r.config,
   917  			VaultToken:           r.vaultFuture.Get(),
   918  			TaskDir:              r.taskDir.Dir,
   919  			EnvBuilder:           r.envBuilder,
   920  			MaxTemplateEventRate: DefaultMaxTemplateEventRate,
   921  		})
   922  
   923  		if err != nil {
   924  			err := fmt.Errorf("failed to build task's template manager: %v", err)
   925  			r.setState(structs.TaskStateDead,
   926  				structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(),
   927  				false)
   928  			r.logger.Printf("[ERR] client: alloc %q, task %q %v", r.alloc.ID, r.task.Name, err)
   929  			r.Kill("vault", err.Error(), true)
   930  			return
   931  		}
   932  	}
   933  }
   934  
   935  // prestart handles life-cycle tasks that occur before the task has started.
   936  // Since it's run asynchronously with the main Run() loop the alloc & task are
   937  // passed in to avoid racing with updates.
   938  func (r *TaskRunner) prestart(alloc *structs.Allocation, task *structs.Task, resultCh chan bool) {
   939  	if task.Vault != nil {
   940  		// Wait for the token
   941  		r.logger.Printf("[DEBUG] client: waiting for Vault token for task %v in alloc %q", task.Name, alloc.ID)
   942  		tokenCh := r.vaultFuture.Wait()
   943  		select {
   944  		case <-tokenCh:
   945  		case <-r.waitCh:
   946  			resultCh <- false
   947  			return
   948  		}
   949  		r.logger.Printf("[DEBUG] client: retrieved Vault token for task %v in alloc %q", task.Name, alloc.ID)
   950  		r.envBuilder.SetVaultToken(r.vaultFuture.Get(), task.Vault.Env)
   951  	}
   952  
   953  	// If the job is a dispatch job and there is a payload write it to disk
   954  	requirePayload := len(alloc.Job.Payload) != 0 &&
   955  		(r.task.DispatchPayload != nil && r.task.DispatchPayload.File != "")
   956  	if !r.payloadRendered && requirePayload {
   957  		renderTo := filepath.Join(r.taskDir.LocalDir, task.DispatchPayload.File)
   958  		decoded, err := snappy.Decode(nil, alloc.Job.Payload)
   959  		if err != nil {
   960  			r.setState(
   961  				structs.TaskStateDead,
   962  				structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(),
   963  				false)
   964  			resultCh <- false
   965  			return
   966  		}
   967  
   968  		if err := os.MkdirAll(filepath.Dir(renderTo), 07777); err != nil {
   969  			r.setState(
   970  				structs.TaskStateDead,
   971  				structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(),
   972  				false)
   973  			resultCh <- false
   974  			return
   975  		}
   976  
   977  		if err := ioutil.WriteFile(renderTo, decoded, 0777); err != nil {
   978  			r.setState(
   979  				structs.TaskStateDead,
   980  				structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(),
   981  				false)
   982  			resultCh <- false
   983  			return
   984  		}
   985  
   986  		r.payloadRendered = true
   987  	}
   988  
   989  	for {
   990  		r.persistLock.Lock()
   991  		downloaded := r.artifactsDownloaded
   992  		r.persistLock.Unlock()
   993  
   994  		// Download the task's artifacts
   995  		if !downloaded && len(task.Artifacts) > 0 {
   996  			r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskDownloadingArtifacts), false)
   997  			taskEnv := r.envBuilder.Build()
   998  			for _, artifact := range task.Artifacts {
   999  				if err := getter.GetArtifact(taskEnv, artifact, r.taskDir.Dir); err != nil {
  1000  					wrapped := fmt.Errorf("failed to download artifact %q: %v", artifact.GetterSource, err)
  1001  					r.logger.Printf("[DEBUG] client: %v", wrapped)
  1002  					r.setState(structs.TaskStatePending,
  1003  						structs.NewTaskEvent(structs.TaskArtifactDownloadFailed).SetDownloadError(wrapped), false)
  1004  					r.restartTracker.SetStartError(structs.WrapRecoverable(wrapped.Error(), err))
  1005  					goto RESTART
  1006  				}
  1007  			}
  1008  
  1009  			r.persistLock.Lock()
  1010  			r.artifactsDownloaded = true
  1011  			r.persistLock.Unlock()
  1012  		}
  1013  
  1014  		// We don't have to wait for any template
  1015  		if len(task.Templates) == 0 {
  1016  			// Send the start signal
  1017  			select {
  1018  			case r.startCh <- struct{}{}:
  1019  			default:
  1020  			}
  1021  
  1022  			resultCh <- true
  1023  			return
  1024  		}
  1025  
  1026  		// Build the template manager
  1027  		if r.templateManager == nil {
  1028  			var err error
  1029  			r.templateManager, err = NewTaskTemplateManager(&TaskTemplateManagerConfig{
  1030  				Hooks:                r,
  1031  				Templates:            r.task.Templates,
  1032  				ClientConfig:         r.config,
  1033  				VaultToken:           r.vaultFuture.Get(),
  1034  				TaskDir:              r.taskDir.Dir,
  1035  				EnvBuilder:           r.envBuilder,
  1036  				MaxTemplateEventRate: DefaultMaxTemplateEventRate,
  1037  			})
  1038  			if err != nil {
  1039  				err := fmt.Errorf("failed to build task's template manager: %v", err)
  1040  				r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(), false)
  1041  				r.logger.Printf("[ERR] client: alloc %q, task %q %v", alloc.ID, task.Name, err)
  1042  				resultCh <- false
  1043  				return
  1044  			}
  1045  		}
  1046  
  1047  		// Block for consul-template
  1048  		// TODO Hooks should register themselves as blocking and then we can
  1049  		// periodically enumerate what we are still blocked on
  1050  		select {
  1051  		case <-r.unblockCh:
  1052  			// Send the start signal
  1053  			select {
  1054  			case r.startCh <- struct{}{}:
  1055  			default:
  1056  			}
  1057  
  1058  			resultCh <- true
  1059  			return
  1060  		case <-r.waitCh:
  1061  			// The run loop has exited so exit too
  1062  			resultCh <- false
  1063  			return
  1064  		}
  1065  
  1066  	RESTART:
  1067  		restart := r.shouldRestart()
  1068  		if !restart {
  1069  			resultCh <- false
  1070  			return
  1071  		}
  1072  	}
  1073  }
  1074  
  1075  // postrun is used to do any cleanup that is necessary after exiting the runloop
  1076  func (r *TaskRunner) postrun() {
  1077  	// Stop the template manager
  1078  	if r.templateManager != nil {
  1079  		r.templateManager.Stop()
  1080  	}
  1081  }
  1082  
// run is the main run loop that handles starting the application, destroying
// it, restarts and signals.
//
// Each pass of the outer loop launches prestart in its own goroutine and then
// services events in the inner WAIT loop until the task exits or a restart is
// triggered. Control then reaches RESTART, where shouldRestart decides whether
// to loop again. Every return path first sets the task state to
// structs.TaskStateDead.
func (r *TaskRunner) run() {
	// Predeclare things so we can jump to the RESTART
	var stopCollection chan struct{}
	var handleWaitCh chan *dstructs.WaitResult

	// If we already have a handle, populate the stopCollection and handleWaitCh
	// to fix the invariant that it exists.
	handleEmpty := r.getHandle() == nil

	if !handleEmpty {
		stopCollection = make(chan struct{})
		go r.collectResourceUsageStats(stopCollection)
		handleWaitCh = r.handle.WaitCh()
	}

	for {
		// Do the prestart activities. The result channel has capacity one so
		// prestart can deliver its single result without waiting on this loop.
		prestartResultCh := make(chan bool, 1)
		go r.prestart(r.alloc, r.task, prestartResultCh)

	WAIT:
		for {
			select {
			case success := <-prestartResultCh:
				// Only a failed prestart is acted on here; on success the loop
				// keeps waiting for the start signal on r.startCh.
				if !success {
					r.cleanup()
					r.setState(structs.TaskStateDead, nil, false)
					return
				}
			case <-r.startCh:
				// Start the task if not yet started or it is being forced. This logic
				// is necessary because in the case of a restore the handle already
				// exists.
				handleEmpty := r.getHandle() == nil
				if handleEmpty {
					startErr := r.startTask()
					r.restartTracker.SetStartError(startErr)
					if startErr != nil {
						// NOTE(review): the empty state string presumably leaves the
						// current task state unchanged - confirm against setState.
						r.setState("", structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(startErr), true)
						goto RESTART
					}

					// Mark the task as started
					r.setState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted), false)
					r.runningLock.Lock()
					r.running = true
					r.runningLock.Unlock()

					// Start stats collection unless a collector is already
					// running for a restored handle.
					if stopCollection == nil {
						stopCollection = make(chan struct{})
						go r.collectResourceUsageStats(stopCollection)
					}

					handleWaitCh = r.handle.WaitCh()
				}

			// handleWaitCh is nil until a handle exists; receiving from a nil
			// channel blocks forever, so this case is inert until then.
			case waitRes := <-handleWaitCh:
				if waitRes == nil {
					panic("nil wait")
				}

				r.runningLock.Lock()
				r.running = false
				r.runningLock.Unlock()

				// Stop collection of the task's resource usage
				close(stopCollection)

				// Log whether the task was successful or not.
				r.restartTracker.SetWaitResult(waitRes)
				r.setState("", r.waitErrorToEvent(waitRes), true)
				if !waitRes.Successful() {
					r.logger.Printf("[INFO] client: task %q for alloc %q failed: %v", r.task.Name, r.alloc.ID, waitRes)
				} else {
					r.logger.Printf("[INFO] client: task %q for alloc %q completed successfully", r.task.Name, r.alloc.ID)
				}

				// The task has exited; let the restart policy decide what
				// happens next.
				break WAIT
			case update := <-r.updateCh:
				// A failed update is logged and otherwise ignored.
				if err := r.handleUpdate(update); err != nil {
					r.logger.Printf("[ERR] client: update to task %q failed: %v", r.task.Name, err)
				}

			case se := <-r.signalCh:
				r.runningLock.Lock()
				running := r.running
				r.runningLock.Unlock()
				common := fmt.Sprintf("signal %v to task %v for alloc %q", se.s, r.task.Name, r.alloc.ID)
				if !running {
					// Send no error
					r.logger.Printf("[DEBUG] client: skipping %s", common)
					se.result <- nil
					continue
				}

				r.logger.Printf("[DEBUG] client: sending %s", common)
				r.setState(structs.TaskStateRunning, se.e, false)

				// Deliver the signal and hand the driver's result back to the
				// requester.
				res := r.handle.Signal(se.s)
				se.result <- res

			case restartEvent := <-r.restartCh:
				r.runningLock.Lock()
				running := r.running
				r.runningLock.Unlock()
				common := fmt.Sprintf("task %v for alloc %q", r.task.Name, r.alloc.ID)
				if !running {
					r.logger.Printf("[DEBUG] client: skipping restart of %v: task isn't running", common)
					continue
				}

				r.logger.Printf("[DEBUG] client: restarting %s: %v", common, restartEvent.taskEvent.RestartReason)
				r.setState(structs.TaskStateRunning, restartEvent.taskEvent, false)
				r.killTask(nil)

				// Stop collection of the killed task's resource usage
				close(stopCollection)

				// Wait for the killed task's wait result before restarting.
				if handleWaitCh != nil {
					<-handleWaitCh
				}

				r.restartTracker.SetRestartTriggered(restartEvent.failure)
				break WAIT

			case <-r.destroyCh:
				r.runningLock.Lock()
				running := r.running
				r.runningLock.Unlock()
				// Nothing to kill when the task isn't running: clean up and
				// record the destroy event.
				if !running {
					r.cleanup()
					r.setState(structs.TaskStateDead, r.destroyEvent, false)
					return
				}

				// Remove from consul before killing the task so that traffic
				// can be rerouted
				interpTask := interpolateServices(r.envBuilder.Build(), r.task)
				r.consul.RemoveTask(r.alloc.ID, interpTask)

				// Delay actually killing the task if configured. See #244
				if r.task.ShutdownDelay > 0 {
					r.logger.Printf("[DEBUG] client: delaying shutdown of alloc %q task %q for %q",
						r.alloc.ID, r.task.Name, r.task.ShutdownDelay)
					<-time.After(r.task.ShutdownDelay)
				}

				// Store the task event that provides context on the task
				// destroy. The Killed event is set from the alloc_runner and
				// doesn't add detail
				var killEvent *structs.TaskEvent
				if r.destroyEvent.Type != structs.TaskKilled {
					if r.destroyEvent.Type == structs.TaskKilling {
						// A Killing event is handed to killTask, which uses it
						// as the killing event.
						killEvent = r.destroyEvent
					} else {
						r.setState(structs.TaskStateRunning, r.destroyEvent, false)
					}
				}

				r.killTask(killEvent)
				close(stopCollection)

				// Wait for handler to exit before calling cleanup
				<-handleWaitCh
				r.cleanup()

				r.setState(structs.TaskStateDead, nil, false)
				return
			}
		}

	RESTART:
		// shouldRestart will block if the task should restart after a delay.
		restart := r.shouldRestart()
		if !restart {
			r.cleanup()
			r.setState(structs.TaskStateDead, nil, false)
			return
		}

		// Clear the handle so a new driver will be created. The wait channel
		// and stats stop channel belong to the old handle and are reset too.
		r.handleLock.Lock()
		r.handle = nil
		handleWaitCh = nil
		stopCollection = nil
		r.handleLock.Unlock()
	}
}
  1272  
  1273  // cleanup removes Consul entries and calls Driver.Cleanup when a task is
  1274  // stopping. Errors are logged.
  1275  func (r *TaskRunner) cleanup() {
  1276  	// Remove from Consul
  1277  	interpTask := interpolateServices(r.envBuilder.Build(), r.task)
  1278  	r.consul.RemoveTask(r.alloc.ID, interpTask)
  1279  
  1280  	drv, err := r.createDriver()
  1281  	if err != nil {
  1282  		r.logger.Printf("[ERR] client: error creating driver to cleanup resources: %v", err)
  1283  		return
  1284  	}
  1285  
  1286  	res := r.getCreatedResources()
  1287  
  1288  	ctx := driver.NewExecContext(r.taskDir, r.envBuilder.Build())
  1289  	attempts := 1
  1290  	var cleanupErr error
  1291  	for retry := true; retry; attempts++ {
  1292  		cleanupErr = drv.Cleanup(ctx, res)
  1293  		retry = structs.IsRecoverable(cleanupErr)
  1294  
  1295  		// Copy current createdResources state in case SaveState is
  1296  		// called between retries
  1297  		r.setCreatedResources(res)
  1298  
  1299  		// Retry 3 times with sleeps between
  1300  		if !retry || attempts > 3 {
  1301  			break
  1302  		}
  1303  		time.Sleep(time.Duration(attempts) * time.Second)
  1304  	}
  1305  
  1306  	if cleanupErr != nil {
  1307  		r.logger.Printf("[ERR] client: error cleaning up resources for task %q after %d attempts: %v", r.task.Name, attempts, cleanupErr)
  1308  	}
  1309  	return
  1310  }
  1311  
  1312  // shouldRestart returns if the task should restart. If the return value is
  1313  // true, the task's restart policy has already been considered and any wait time
  1314  // between restarts has been applied.
  1315  func (r *TaskRunner) shouldRestart() bool {
  1316  	state, when := r.restartTracker.GetState()
  1317  	reason := r.restartTracker.GetReason()
  1318  	switch state {
  1319  	case structs.TaskNotRestarting, structs.TaskTerminated:
  1320  		r.logger.Printf("[INFO] client: Not restarting task: %v for alloc: %v ", r.task.Name, r.alloc.ID)
  1321  		if state == structs.TaskNotRestarting {
  1322  			r.setState(structs.TaskStateDead,
  1323  				structs.NewTaskEvent(structs.TaskNotRestarting).
  1324  					SetRestartReason(reason).SetFailsTask(),
  1325  				false)
  1326  		}
  1327  		return false
  1328  	case structs.TaskRestarting:
  1329  		r.logger.Printf("[INFO] client: Restarting task %q for alloc %q in %v", r.task.Name, r.alloc.ID, when)
  1330  		r.setState(structs.TaskStatePending,
  1331  			structs.NewTaskEvent(structs.TaskRestarting).
  1332  				SetRestartDelay(when).
  1333  				SetRestartReason(reason),
  1334  			false)
  1335  	default:
  1336  		r.logger.Printf("[ERR] client: restart tracker returned unknown state: %q", state)
  1337  		return false
  1338  	}
  1339  
  1340  	// Unregister from Consul while waiting to restart.
  1341  	interpTask := interpolateServices(r.envBuilder.Build(), r.task)
  1342  	r.consul.RemoveTask(r.alloc.ID, interpTask)
  1343  
  1344  	// Sleep but watch for destroy events.
  1345  	select {
  1346  	case <-time.After(when):
  1347  	case <-r.destroyCh:
  1348  	}
  1349  
  1350  	// Destroyed while we were waiting to restart, so abort.
  1351  	r.destroyLock.Lock()
  1352  	destroyed := r.destroy
  1353  	r.destroyLock.Unlock()
  1354  	if destroyed {
  1355  		r.logger.Printf("[DEBUG] client: Not restarting task: %v because it has been destroyed", r.task.Name)
  1356  		r.setState(structs.TaskStateDead, r.destroyEvent, false)
  1357  		return false
  1358  	}
  1359  
  1360  	return true
  1361  }
  1362  
  1363  // killTask kills the running task. A killing event can optionally be passed and
  1364  // this event is used to mark the task as being killed. It provides a means to
  1365  // store extra information.
  1366  func (r *TaskRunner) killTask(killingEvent *structs.TaskEvent) {
  1367  	r.runningLock.Lock()
  1368  	running := r.running
  1369  	r.runningLock.Unlock()
  1370  	if !running {
  1371  		return
  1372  	}
  1373  
  1374  	// Get the kill timeout
  1375  	timeout := driver.GetKillTimeout(r.task.KillTimeout, r.config.MaxKillTimeout)
  1376  
  1377  	// Build the event
  1378  	var event *structs.TaskEvent
  1379  	if killingEvent != nil {
  1380  		event = killingEvent
  1381  		event.Type = structs.TaskKilling
  1382  	} else {
  1383  		event = structs.NewTaskEvent(structs.TaskKilling)
  1384  	}
  1385  	event.SetKillTimeout(timeout)
  1386  
  1387  	// Mark that we received the kill event
  1388  	r.setState(structs.TaskStateRunning, event, false)
  1389  
  1390  	handle := r.getHandle()
  1391  
  1392  	// Kill the task using an exponential backoff in-case of failures.
  1393  	destroySuccess, err := r.handleDestroy(handle)
  1394  	if !destroySuccess {
  1395  		// We couldn't successfully destroy the resource created.
  1396  		r.logger.Printf("[ERR] client: failed to kill task %q. Resources may have been leaked: %v", r.task.Name, err)
  1397  	}
  1398  
  1399  	r.runningLock.Lock()
  1400  	r.running = false
  1401  	r.runningLock.Unlock()
  1402  
  1403  	// Store that the task has been destroyed and any associated error.
  1404  	r.setState("", structs.NewTaskEvent(structs.TaskKilled).SetKillError(err), true)
  1405  }
  1406  
  1407  // startTask creates the driver, task dir, and starts the task.
  1408  func (r *TaskRunner) startTask() error {
  1409  	// Create a driver
  1410  	drv, err := r.createDriver()
  1411  	if err != nil {
  1412  		return fmt.Errorf("failed to create driver of task %q for alloc %q: %v",
  1413  			r.task.Name, r.alloc.ID, err)
  1414  	}
  1415  
  1416  	// Run prestart
  1417  	ctx := driver.NewExecContext(r.taskDir, r.envBuilder.Build())
  1418  	presp, err := drv.Prestart(ctx, r.task)
  1419  
  1420  	// Merge newly created resources into previously created resources
  1421  	if presp != nil {
  1422  		r.createdResourcesLock.Lock()
  1423  		r.createdResources.Merge(presp.CreatedResources)
  1424  		r.createdResourcesLock.Unlock()
  1425  
  1426  		// Set any network configuration returned by the driver
  1427  		r.envBuilder.SetDriverNetwork(presp.Network)
  1428  	}
  1429  
  1430  	if err != nil {
  1431  		wrapped := fmt.Sprintf("failed to initialize task %q for alloc %q: %v",
  1432  			r.task.Name, r.alloc.ID, err)
  1433  		r.logger.Printf("[WARN] client: error from prestart: %s", wrapped)
  1434  		return structs.WrapRecoverable(wrapped, err)
  1435  	}
  1436  
  1437  	// Create a new context for Start since the environment may have been updated.
  1438  	ctx = driver.NewExecContext(r.taskDir, r.envBuilder.Build())
  1439  
  1440  	// Start the job
  1441  	sresp, err := drv.Start(ctx, r.task)
  1442  	if err != nil {
  1443  		wrapped := fmt.Sprintf("failed to start task %q for alloc %q: %v",
  1444  			r.task.Name, r.alloc.ID, err)
  1445  		r.logger.Printf("[WARN] client: %s", wrapped)
  1446  		return structs.WrapRecoverable(wrapped, err)
  1447  
  1448  	}
  1449  
  1450  	// Log driver network information
  1451  	if sresp.Network != nil && sresp.Network.IP != "" {
  1452  		if sresp.Network.AutoAdvertise {
  1453  			r.logger.Printf("[INFO] client: alloc %s task %s auto-advertising detected IP %s",
  1454  				r.alloc.ID, r.task.Name, sresp.Network.IP)
  1455  		} else {
  1456  			r.logger.Printf("[TRACE] client: alloc %s task %s detected IP %s but not auto-advertising",
  1457  				r.alloc.ID, r.task.Name, sresp.Network.IP)
  1458  		}
  1459  	}
  1460  
  1461  	if sresp.Network == nil || sresp.Network.IP == "" {
  1462  		r.logger.Printf("[TRACE] client: alloc %s task %s could not detect a driver IP", r.alloc.ID, r.task.Name)
  1463  	}
  1464  
  1465  	// Update environment with the network defined by the driver's Start method.
  1466  	r.envBuilder.SetDriverNetwork(sresp.Network)
  1467  
  1468  	if err := r.registerServices(drv, sresp.Handle, sresp.Network); err != nil {
  1469  		// All IO is done asynchronously, so errors from registering
  1470  		// services are hard failures.
  1471  		r.logger.Printf("[ERR] client: failed to register services and checks for task %q alloc %q: %v", r.task.Name, r.alloc.ID, err)
  1472  
  1473  		// Kill the started task
  1474  		if destroyed, err := r.handleDestroy(sresp.Handle); !destroyed {
  1475  			r.logger.Printf("[ERR] client: failed to kill task %q alloc %q. Resources may be leaked: %v",
  1476  				r.task.Name, r.alloc.ID, err)
  1477  		}
  1478  		return structs.NewRecoverableError(err, false)
  1479  	}
  1480  
  1481  	r.handleLock.Lock()
  1482  	r.handle = sresp.Handle
  1483  	r.handleLock.Unlock()
  1484  
  1485  	// Need to persist the driver network between restarts
  1486  	r.driverNetLock.Lock()
  1487  	r.driverNet = sresp.Network
  1488  	r.driverNetLock.Unlock()
  1489  
  1490  	return nil
  1491  }
  1492  
  1493  // registerServices and checks with Consul.
  1494  func (r *TaskRunner) registerServices(d driver.Driver, h driver.DriverHandle, n *cstructs.DriverNetwork) error {
  1495  	var exec driver.ScriptExecutor
  1496  	if d.Abilities().Exec {
  1497  		// Allow set the script executor if the driver supports it
  1498  		exec = h
  1499  	}
  1500  	interpolatedTask := interpolateServices(r.envBuilder.Build(), r.task)
  1501  	return r.consul.RegisterTask(r.alloc.ID, interpolatedTask, r, exec, n)
  1502  }
  1503  
  1504  // interpolateServices interpolates tags in a service and checks with values from the
  1505  // task's environment.
  1506  func interpolateServices(taskEnv *env.TaskEnv, task *structs.Task) *structs.Task {
  1507  	taskCopy := task.Copy()
  1508  	for _, service := range taskCopy.Services {
  1509  		for _, check := range service.Checks {
  1510  			check.Name = taskEnv.ReplaceEnv(check.Name)
  1511  			check.Type = taskEnv.ReplaceEnv(check.Type)
  1512  			check.Command = taskEnv.ReplaceEnv(check.Command)
  1513  			check.Args = taskEnv.ParseAndReplace(check.Args)
  1514  			check.Path = taskEnv.ReplaceEnv(check.Path)
  1515  			check.Protocol = taskEnv.ReplaceEnv(check.Protocol)
  1516  			check.PortLabel = taskEnv.ReplaceEnv(check.PortLabel)
  1517  			check.InitialStatus = taskEnv.ReplaceEnv(check.InitialStatus)
  1518  			check.Method = taskEnv.ReplaceEnv(check.Method)
  1519  			if len(check.Header) > 0 {
  1520  				header := make(map[string][]string, len(check.Header))
  1521  				for k, vs := range check.Header {
  1522  					newVals := make([]string, len(vs))
  1523  					for i, v := range vs {
  1524  						newVals[i] = taskEnv.ReplaceEnv(v)
  1525  					}
  1526  					header[taskEnv.ReplaceEnv(k)] = newVals
  1527  				}
  1528  				check.Header = header
  1529  			}
  1530  		}
  1531  		service.Name = taskEnv.ReplaceEnv(service.Name)
  1532  		service.PortLabel = taskEnv.ReplaceEnv(service.PortLabel)
  1533  		service.Tags = taskEnv.ParseAndReplace(service.Tags)
  1534  	}
  1535  	return taskCopy
  1536  }
  1537  
  1538  // buildTaskDir creates the task directory before driver.Prestart. It is safe
  1539  // to call multiple times as its state is persisted.
  1540  func (r *TaskRunner) buildTaskDir(fsi cstructs.FSIsolation) error {
  1541  	r.persistLock.Lock()
  1542  	built := r.taskDirBuilt
  1543  	r.persistLock.Unlock()
  1544  
  1545  	// We do not set the state again since this only occurs during restoration
  1546  	// and the task dir is already built. The reason we call Build again is to
  1547  	// ensure that the task dir invariants are still held.
  1548  	if !built {
  1549  		r.setState(structs.TaskStatePending,
  1550  			structs.NewTaskEvent(structs.TaskSetup).SetMessage(structs.TaskBuildingTaskDir),
  1551  			false)
  1552  	}
  1553  
  1554  	chroot := config.DefaultChrootEnv
  1555  	if len(r.config.ChrootEnv) > 0 {
  1556  		chroot = r.config.ChrootEnv
  1557  	}
  1558  	if err := r.taskDir.Build(built, chroot, fsi); err != nil {
  1559  		return err
  1560  	}
  1561  
  1562  	// Mark task dir as successfully built
  1563  	r.persistLock.Lock()
  1564  	r.taskDirBuilt = true
  1565  	r.persistLock.Unlock()
  1566  
  1567  	// Set path and host related env vars
  1568  	driver.SetEnvvars(r.envBuilder, fsi, r.taskDir, r.config)
  1569  	return nil
  1570  }
  1571  
  1572  // collectResourceUsageStats starts collecting resource usage stats of a Task.
  1573  // Collection ends when the passed channel is closed
  1574  func (r *TaskRunner) collectResourceUsageStats(stopCollection <-chan struct{}) {
  1575  	// start collecting the stats right away and then start collecting every
  1576  	// collection interval
  1577  	next := time.NewTimer(0)
  1578  	defer next.Stop()
  1579  	for {
  1580  		select {
  1581  		case <-next.C:
  1582  			next.Reset(r.config.StatsCollectionInterval)
  1583  			handle := r.getHandle()
  1584  			if handle == nil {
  1585  				continue
  1586  			}
  1587  			ru, err := handle.Stats()
  1588  
  1589  			if err != nil {
  1590  				// Check if the driver doesn't implement stats
  1591  				if err.Error() == driver.DriverStatsNotImplemented.Error() {
  1592  					r.logger.Printf("[DEBUG] client: driver for task %q in allocation %q doesn't support stats", r.task.Name, r.alloc.ID)
  1593  					return
  1594  				}
  1595  
  1596  				// We do not log when the plugin is shutdown as this is simply a
  1597  				// race between the stopCollection channel being closed and calling
  1598  				// Stats on the handle.
  1599  				if !strings.Contains(err.Error(), "connection is shut down") {
  1600  					r.logger.Printf("[DEBUG] client: error fetching stats of task %v: %v", r.task.Name, err)
  1601  				}
  1602  				continue
  1603  			}
  1604  
  1605  			r.resourceUsageLock.Lock()
  1606  			r.resourceUsage = ru
  1607  			r.resourceUsageLock.Unlock()
  1608  			if ru != nil {
  1609  				r.emitStats(ru)
  1610  			}
  1611  		case <-stopCollection:
  1612  			return
  1613  		}
  1614  	}
  1615  }
  1616  
  1617  // LatestResourceUsage returns the last resource utilization datapoint collected
  1618  func (r *TaskRunner) LatestResourceUsage() *cstructs.TaskResourceUsage {
  1619  	r.resourceUsageLock.RLock()
  1620  	defer r.resourceUsageLock.RUnlock()
  1621  	r.runningLock.Lock()
  1622  	defer r.runningLock.Unlock()
  1623  
  1624  	// If the task is not running there can be no latest resource
  1625  	if !r.running {
  1626  		return nil
  1627  	}
  1628  
  1629  	return r.resourceUsage
  1630  }
  1631  
  1632  // handleUpdate takes an updated allocation and updates internal state to
  1633  // reflect the new config for the task.
  1634  func (r *TaskRunner) handleUpdate(update *structs.Allocation) error {
  1635  	// Extract the task group from the alloc.
  1636  	tg := update.Job.LookupTaskGroup(update.TaskGroup)
  1637  	if tg == nil {
  1638  		return fmt.Errorf("alloc '%s' missing task group '%s'", update.ID, update.TaskGroup)
  1639  	}
  1640  
  1641  	// Extract the task.
  1642  	var updatedTask *structs.Task
  1643  	for _, t := range tg.Tasks {
  1644  		if t.Name == r.task.Name {
  1645  			updatedTask = t.Copy()
  1646  			break
  1647  		}
  1648  	}
  1649  	if updatedTask == nil {
  1650  		return fmt.Errorf("task group %q doesn't contain task %q", tg.Name, r.task.Name)
  1651  	}
  1652  
  1653  	// Merge in the task resources
  1654  	updatedTask.Resources = update.TaskResources[updatedTask.Name]
  1655  
  1656  	// Interpolate the old task with the old env before updating the env as
  1657  	// updating services in Consul need both the old and new interpolations
  1658  	// to find differences.
  1659  	oldInterpolatedTask := interpolateServices(r.envBuilder.Build(), r.task)
  1660  
  1661  	// Now it's safe to update the environment
  1662  	r.envBuilder.UpdateTask(update, updatedTask)
  1663  
  1664  	var mErr multierror.Error
  1665  	r.handleLock.Lock()
  1666  	if r.handle != nil {
  1667  		drv, err := r.createDriver()
  1668  		if err != nil {
  1669  			// Something has really gone wrong; don't continue
  1670  			r.handleLock.Unlock()
  1671  			return fmt.Errorf("error accessing driver when updating task %q: %v", r.task.Name, err)
  1672  		}
  1673  
  1674  		// Update will update resources and store the new kill timeout.
  1675  		if err := r.handle.Update(updatedTask); err != nil {
  1676  			mErr.Errors = append(mErr.Errors, fmt.Errorf("updating task resources failed: %v", err))
  1677  		}
  1678  
  1679  		// Update services in Consul
  1680  		newInterpolatedTask := interpolateServices(r.envBuilder.Build(), updatedTask)
  1681  		if err := r.updateServices(drv, r.handle, oldInterpolatedTask, newInterpolatedTask); err != nil {
  1682  			mErr.Errors = append(mErr.Errors, fmt.Errorf("error updating services and checks in Consul: %v", err))
  1683  		}
  1684  	}
  1685  	r.handleLock.Unlock()
  1686  
  1687  	// Update the restart policy.
  1688  	if r.restartTracker != nil {
  1689  		r.restartTracker.SetPolicy(tg.RestartPolicy)
  1690  	}
  1691  
  1692  	// Store the updated alloc.
  1693  	r.alloc = update
  1694  	r.task = updatedTask
  1695  	return mErr.ErrorOrNil()
  1696  }
  1697  
  1698  // updateServices and checks with Consul. Tasks must be interpolated!
  1699  func (r *TaskRunner) updateServices(d driver.Driver, h driver.ScriptExecutor, oldTask, newTask *structs.Task) error {
  1700  	var exec driver.ScriptExecutor
  1701  	if d.Abilities().Exec {
  1702  		// Allow set the script executor if the driver supports it
  1703  		exec = h
  1704  	}
  1705  	r.driverNetLock.Lock()
  1706  	net := r.driverNet.Copy()
  1707  	r.driverNetLock.Unlock()
  1708  	return r.consul.UpdateTask(r.alloc.ID, oldTask, newTask, r, exec, net)
  1709  }
  1710  
  1711  // handleDestroy kills the task handle. In the case that killing fails,
  1712  // handleDestroy will retry with an exponential backoff and will give up at a
  1713  // given limit. It returns whether the task was destroyed and the error
  1714  // associated with the last kill attempt.
  1715  func (r *TaskRunner) handleDestroy(handle driver.DriverHandle) (destroyed bool, err error) {
  1716  	// Cap the number of times we attempt to kill the task.
  1717  	for i := 0; i < killFailureLimit; i++ {
  1718  		if err = handle.Kill(); err != nil {
  1719  			// Calculate the new backoff
  1720  			backoff := (1 << (2 * uint64(i))) * killBackoffBaseline
  1721  			if backoff > killBackoffLimit {
  1722  				backoff = killBackoffLimit
  1723  			}
  1724  
  1725  			r.logger.Printf("[ERR] client: failed to kill task '%s' for alloc %q. Retrying in %v: %v",
  1726  				r.task.Name, r.alloc.ID, backoff, err)
  1727  			time.Sleep(backoff)
  1728  		} else {
  1729  			// Kill was successful
  1730  			return true, nil
  1731  		}
  1732  	}
  1733  	return
  1734  }
  1735  
  1736  // Restart will restart the task.
  1737  func (r *TaskRunner) Restart(source, reason string, failure bool) {
  1738  	reasonStr := fmt.Sprintf("%s: %s", source, reason)
  1739  	event := newTaskRestartEvent(reasonStr, failure)
  1740  
  1741  	select {
  1742  	case r.restartCh <- event:
  1743  	case <-r.waitCh:
  1744  	}
  1745  }
  1746  
  1747  // Signal will send a signal to the task
  1748  func (r *TaskRunner) Signal(source, reason string, s os.Signal) error {
  1749  
  1750  	reasonStr := fmt.Sprintf("%s: %s", source, reason)
  1751  	event := structs.NewTaskEvent(structs.TaskSignaling).SetTaskSignal(s).SetTaskSignalReason(reasonStr)
  1752  
  1753  	resCh := make(chan error)
  1754  	se := SignalEvent{
  1755  		s:      s,
  1756  		e:      event,
  1757  		result: resCh,
  1758  	}
  1759  
  1760  	select {
  1761  	case r.signalCh <- se:
  1762  	case <-r.waitCh:
  1763  	}
  1764  
  1765  	return <-resCh
  1766  }
  1767  
  1768  // Kill will kill a task and store the error, no longer restarting the task. If
  1769  // fail is set, the task is marked as having failed.
  1770  func (r *TaskRunner) Kill(source, reason string, fail bool) {
  1771  	reasonStr := fmt.Sprintf("%s: %s", source, reason)
  1772  	event := structs.NewTaskEvent(structs.TaskKilling).SetKillReason(reasonStr)
  1773  	if fail {
  1774  		event.SetFailsTask()
  1775  	}
  1776  
  1777  	r.logger.Printf("[DEBUG] client: killing task %v for alloc %q: %v", r.task.Name, r.alloc.ID, reasonStr)
  1778  	r.Destroy(event)
  1779  }
  1780  
  1781  func (r *TaskRunner) EmitEvent(source, message string) {
  1782  	event := structs.NewTaskEvent(source).
  1783  		SetMessage(message)
  1784  	r.setState("", event, false)
  1785  	r.logger.Printf("[DEBUG] client: event from %q for task %q in alloc %q: %v",
  1786  		source, r.task.Name, r.alloc.ID, message)
  1787  }
  1788  
  1789  // UnblockStart unblocks the starting of the task. It currently assumes only
  1790  // consul-template will unblock
  1791  func (r *TaskRunner) UnblockStart(source string) {
  1792  	r.unblockLock.Lock()
  1793  	defer r.unblockLock.Unlock()
  1794  	if r.unblocked {
  1795  		return
  1796  	}
  1797  
  1798  	r.logger.Printf("[DEBUG] client: unblocking task %v for alloc %q: %v", r.task.Name, r.alloc.ID, source)
  1799  	r.unblocked = true
  1800  	close(r.unblockCh)
  1801  }
  1802  
  1803  // Helper function for converting a WaitResult into a TaskTerminated event.
  1804  func (r *TaskRunner) waitErrorToEvent(res *dstructs.WaitResult) *structs.TaskEvent {
  1805  	return structs.NewTaskEvent(structs.TaskTerminated).
  1806  		SetExitCode(res.ExitCode).
  1807  		SetSignal(res.Signal).
  1808  		SetExitMessage(res.Err)
  1809  }
  1810  
  1811  // Update is used to update the task of the context
  1812  func (r *TaskRunner) Update(update *structs.Allocation) {
  1813  	select {
  1814  	case r.updateCh <- update:
  1815  	default:
  1816  		r.logger.Printf("[ERR] client: dropping task update '%s' (alloc '%s')",
  1817  			r.task.Name, r.alloc.ID)
  1818  	}
  1819  }
  1820  
  1821  // Destroy is used to indicate that the task context should be destroyed. The
  1822  // event parameter provides a context for the destroy.
  1823  func (r *TaskRunner) Destroy(event *structs.TaskEvent) {
  1824  	r.destroyLock.Lock()
  1825  	defer r.destroyLock.Unlock()
  1826  
  1827  	if r.destroy {
  1828  		return
  1829  	}
  1830  	r.destroy = true
  1831  	r.destroyEvent = event
  1832  	close(r.destroyCh)
  1833  }
  1834  
  1835  // getCreatedResources returns the resources created by drivers. It will never
  1836  // return nil.
  1837  func (r *TaskRunner) getCreatedResources() *driver.CreatedResources {
  1838  	r.createdResourcesLock.Lock()
  1839  	if r.createdResources == nil {
  1840  		r.createdResources = driver.NewCreatedResources()
  1841  	}
  1842  	cr := r.createdResources.Copy()
  1843  	r.createdResourcesLock.Unlock()
  1844  
  1845  	return cr
  1846  }
  1847  
  1848  // setCreatedResources updates the resources created by drivers. If passed nil
  1849  // it will set createdResources to an initialized struct.
  1850  func (r *TaskRunner) setCreatedResources(cr *driver.CreatedResources) {
  1851  	if cr == nil {
  1852  		cr = driver.NewCreatedResources()
  1853  	}
  1854  	r.createdResourcesLock.Lock()
  1855  	r.createdResources = cr.Copy()
  1856  	r.createdResourcesLock.Unlock()
  1857  }
  1858  
  1859  func (r *TaskRunner) setGaugeForMemory(ru *cstructs.TaskResourceUsage) {
  1860  	if !r.config.DisableTaggedMetrics {
  1861  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "rss"},
  1862  			float32(ru.ResourceUsage.MemoryStats.RSS), r.baseLabels)
  1863  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "rss"},
  1864  			float32(ru.ResourceUsage.MemoryStats.RSS), r.baseLabels)
  1865  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "cache"},
  1866  			float32(ru.ResourceUsage.MemoryStats.Cache), r.baseLabels)
  1867  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "swap"},
  1868  			float32(ru.ResourceUsage.MemoryStats.Swap), r.baseLabels)
  1869  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "max_usage"},
  1870  			float32(ru.ResourceUsage.MemoryStats.MaxUsage), r.baseLabels)
  1871  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "kernel_usage"},
  1872  			float32(ru.ResourceUsage.MemoryStats.KernelUsage), r.baseLabels)
  1873  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "kernel_max_usage"},
  1874  			float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage), r.baseLabels)
  1875  	}
  1876  
  1877  	if r.config.BackwardsCompatibleMetrics {
  1878  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "rss"}, float32(ru.ResourceUsage.MemoryStats.RSS))
  1879  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "cache"}, float32(ru.ResourceUsage.MemoryStats.Cache))
  1880  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "swap"}, float32(ru.ResourceUsage.MemoryStats.Swap))
  1881  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "max_usage"}, float32(ru.ResourceUsage.MemoryStats.MaxUsage))
  1882  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "kernel_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelUsage))
  1883  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "kernel_max_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage))
  1884  	}
  1885  }
  1886  
  1887  func (r *TaskRunner) setGaugeForCPU(ru *cstructs.TaskResourceUsage) {
  1888  	if !r.config.DisableTaggedMetrics {
  1889  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "total_percent"},
  1890  			float32(ru.ResourceUsage.CpuStats.Percent), r.baseLabels)
  1891  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "system"},
  1892  			float32(ru.ResourceUsage.CpuStats.SystemMode), r.baseLabels)
  1893  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "user"},
  1894  			float32(ru.ResourceUsage.CpuStats.UserMode), r.baseLabels)
  1895  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "throttled_time"},
  1896  			float32(ru.ResourceUsage.CpuStats.ThrottledTime), r.baseLabels)
  1897  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "throttled_periods"},
  1898  			float32(ru.ResourceUsage.CpuStats.ThrottledPeriods), r.baseLabels)
  1899  		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "total_ticks"},
  1900  			float32(ru.ResourceUsage.CpuStats.TotalTicks), r.baseLabels)
  1901  	}
  1902  
  1903  	if r.config.BackwardsCompatibleMetrics {
  1904  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_percent"}, float32(ru.ResourceUsage.CpuStats.Percent))
  1905  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "system"}, float32(ru.ResourceUsage.CpuStats.SystemMode))
  1906  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "user"}, float32(ru.ResourceUsage.CpuStats.UserMode))
  1907  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "throttled_time"}, float32(ru.ResourceUsage.CpuStats.ThrottledTime))
  1908  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "throttled_periods"}, float32(ru.ResourceUsage.CpuStats.ThrottledPeriods))
  1909  		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_ticks"}, float32(ru.ResourceUsage.CpuStats.TotalTicks))
  1910  	}
  1911  }
  1912  
  1913  // emitStats emits resource usage stats of tasks to remote metrics collector
  1914  // sinks
  1915  func (r *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) {
  1916  	if !r.config.PublishAllocationMetrics {
  1917  		return
  1918  	}
  1919  
  1920  	// If the task is not running don't emit anything
  1921  	r.runningLock.Lock()
  1922  	running := r.running
  1923  	r.runningLock.Unlock()
  1924  	if !running {
  1925  		return
  1926  	}
  1927  
  1928  	if ru.ResourceUsage.MemoryStats != nil {
  1929  		r.setGaugeForMemory(ru)
  1930  	}
  1931  
  1932  	if ru.ResourceUsage.CpuStats != nil {
  1933  		r.setGaugeForCPU(ru)
  1934  	}
  1935  }