github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/client/allocrunner/alloc_runner.go

     1  package allocrunner
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"sync"
     7  	"time"
     8  
     9  	log "github.com/hashicorp/go-hclog"
    10  	multierror "github.com/hashicorp/go-multierror"
    11  	"github.com/hashicorp/nomad/client/allocdir"
    12  	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
    13  	"github.com/hashicorp/nomad/client/allocrunner/state"
    14  	"github.com/hashicorp/nomad/client/allocrunner/tasklifecycle"
    15  	"github.com/hashicorp/nomad/client/allocrunner/taskrunner"
    16  	"github.com/hashicorp/nomad/client/allocwatcher"
    17  	"github.com/hashicorp/nomad/client/config"
    18  	"github.com/hashicorp/nomad/client/consul"
    19  	"github.com/hashicorp/nomad/client/devicemanager"
    20  	"github.com/hashicorp/nomad/client/dynamicplugins"
    21  	cinterfaces "github.com/hashicorp/nomad/client/interfaces"
    22  	"github.com/hashicorp/nomad/client/lib/cgutil"
    23  	"github.com/hashicorp/nomad/client/pluginmanager/csimanager"
    24  	"github.com/hashicorp/nomad/client/pluginmanager/drivermanager"
    25  	"github.com/hashicorp/nomad/client/serviceregistration"
    26  	"github.com/hashicorp/nomad/client/serviceregistration/checks/checkstore"
    27  	"github.com/hashicorp/nomad/client/serviceregistration/wrapper"
    28  	cstate "github.com/hashicorp/nomad/client/state"
    29  	cstructs "github.com/hashicorp/nomad/client/structs"
    30  	"github.com/hashicorp/nomad/client/vaultclient"
    31  	"github.com/hashicorp/nomad/helper/pointer"
    32  	"github.com/hashicorp/nomad/nomad/structs"
    33  	"github.com/hashicorp/nomad/plugins/device"
    34  	"github.com/hashicorp/nomad/plugins/drivers"
    35  )
    36  
    37  // allocRunner is used to run all the tasks in a given allocation
    38  type allocRunner struct {
    39  	// id is the ID of the allocation. Can be accessed without a lock
    40  	id string
    41  
    42  	// Logger is the logger for the alloc runner.
    43  	logger log.Logger
    44  
    45  	// clientConfig is the client configuration block.
    46  	clientConfig *config.Config
    47  
    48  	// stateUpdater is used to emit updated alloc state
    49  	stateUpdater cinterfaces.AllocStateHandler
    50  
     51  	// taskStateUpdatedCh is ticked whenever task state has changed. Must
     52  	// be buffered (capacity 1) to allow nonblocking notification of state
     53  	// updates while the goroutine is already processing a previous update.
    54  	taskStateUpdatedCh chan struct{}
    55  
    56  	// taskStateUpdateHandlerCh is closed when the task state handling
    57  	// goroutine exits. It is unsafe to destroy the local allocation state
    58  	// before this goroutine exits.
    59  	taskStateUpdateHandlerCh chan struct{}
    60  
     61  	// allocUpdatedCh is a channel that is used to stream allocation updates into
     62  	// the allocUpdate handler. Must be buffered (capacity 1) to allow nonblocking
     63  	// notification of new allocation updates while the goroutine is processing a
     64  	// previous update.
    65  	allocUpdatedCh chan *structs.Allocation
    66  
    67  	// consulClient is the client used by the consul service hook for
    68  	// registering services and checks
    69  	consulClient serviceregistration.Handler
    70  
    71  	// consulProxiesClient is the client used by the envoy version hook for
    72  	// looking up supported envoy versions of the consul agent.
    73  	consulProxiesClient consul.SupportedProxiesAPI
    74  
    75  	// sidsClient is the client used by the service identity hook for
    76  	// managing SI tokens
    77  	sidsClient consul.ServiceIdentityAPI
    78  
     79  	// vaultClient is the client used to manage Vault tokens
    80  	vaultClient vaultclient.VaultClient
    81  
    82  	// waitCh is closed when the Run loop has exited
    83  	waitCh chan struct{}
    84  
    85  	// destroyed is true when the Run loop has exited, postrun hooks have
    86  	// run, and alloc runner has been destroyed. Must acquire destroyedLock
    87  	// to access.
    88  	destroyed bool
    89  
    90  	// destroyCh is closed when the Run loop has exited, postrun hooks have
    91  	// run, and alloc runner has been destroyed.
    92  	destroyCh chan struct{}
    93  
    94  	// shutdown is true when the Run loop has exited, and shutdown hooks have
    95  	// run. Must acquire destroyedLock to access.
    96  	shutdown bool
    97  
    98  	// shutdownCh is closed when the Run loop has exited, and shutdown hooks
    99  	// have run.
   100  	shutdownCh chan struct{}
   101  
   102  	// destroyLaunched is true if Destroy has been called. Must acquire
   103  	// destroyedLock to access.
   104  	destroyLaunched bool
   105  
   106  	// shutdownLaunched is true if Shutdown has been called. Must acquire
   107  	// destroyedLock to access.
   108  	shutdownLaunched bool
   109  
   110  	// destroyedLock guards destroyed, destroyLaunched, shutdownLaunched,
   111  	// and serializes Shutdown/Destroy calls.
   112  	destroyedLock sync.Mutex
   113  
   114  	// Alloc captures the allocation being run.
   115  	alloc     *structs.Allocation
   116  	allocLock sync.RWMutex
   117  
   118  	// state is the alloc runner's state
   119  	state     *state.State
   120  	stateLock sync.RWMutex
   121  
   122  	stateDB cstate.StateDB
   123  
   124  	// allocDir is used to build the allocations directory structure.
   125  	allocDir *allocdir.AllocDir
   126  
   127  	// runnerHooks are alloc runner lifecycle hooks that should be run on state
    128  	// transitions.
   129  	runnerHooks []interfaces.RunnerHook
   130  
   131  	// hookState is the output of allocrunner hooks
   132  	hookState   *cstructs.AllocHookResources
   133  	hookStateMu sync.RWMutex
   134  
   135  	// tasks are the set of task runners
   136  	tasks map[string]*taskrunner.TaskRunner
   137  
   138  	// deviceStatsReporter is used to lookup resource usage for alloc devices
   139  	deviceStatsReporter cinterfaces.DeviceStatsReporter
   140  
   141  	// allocBroadcaster sends client allocation updates to all listeners
   142  	allocBroadcaster *cstructs.AllocBroadcaster
   143  
   144  	// prevAllocWatcher allows waiting for any previous or preempted allocations
   145  	// to exit
   146  	prevAllocWatcher allocwatcher.PrevAllocWatcher
   147  
    148  	// prevAllocMigrator allows the migration of a previous allocation's alloc dir.
   149  	prevAllocMigrator allocwatcher.PrevAllocMigrator
   150  
    151  	// dynamicRegistry contains all locally registered dynamic plugins (e.g. CSI
    152  	// plugins).
   153  	dynamicRegistry dynamicplugins.Registry
   154  
   155  	// csiManager is used to wait for CSI Volumes to be attached, and by the task
   156  	// runner to manage their mounting
   157  	csiManager csimanager.Manager
   158  
   159  	// cpusetManager is responsible for configuring task cgroups if supported by the platform
   160  	cpusetManager cgutil.CpusetManager
   161  
   162  	// devicemanager is used to mount devices as well as lookup device
   163  	// statistics
   164  	devicemanager devicemanager.Manager
   165  
   166  	// driverManager is responsible for dispensing driver plugins and registering
   167  	// event handlers
   168  	driverManager drivermanager.Manager
   169  
   170  	// serversContactedCh is passed to TaskRunners so they can detect when
   171  	// servers have been contacted for the first time in case of a failed
   172  	// restore.
   173  	serversContactedCh chan struct{}
   174  
    175  	// taskCoordinator is used to control when tasks are allowed to run
   176  	// depending on their lifecycle configuration.
   177  	taskCoordinator *tasklifecycle.Coordinator
   178  
   179  	shutdownDelayCtx      context.Context
   180  	shutdownDelayCancelFn context.CancelFunc
   181  
   182  	// rpcClient is the RPC Client that should be used by the allocrunner and its
   183  	// hooks to communicate with Nomad Servers.
   184  	rpcClient RPCer
   185  
   186  	// serviceRegWrapper is the handler wrapper that is used by service hooks
   187  	// to perform service and check registration and deregistration.
   188  	serviceRegWrapper *wrapper.HandlerWrapper
   189  
   190  	// checkStore contains check status information
   191  	checkStore checkstore.Shim
   192  
   193  	// getter is an interface for retrieving artifacts.
   194  	getter cinterfaces.ArtifactGetter
   195  }
   196  
   197  // RPCer is the interface needed by hooks to make RPC calls.
   198  type RPCer interface {
   199  	RPC(method string, args interface{}, reply interface{}) error
   200  }
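         // A sketch of how a hook holding an RPCer might reach the servers. This is
         // illustrative only: the "Alloc.GetAlloc" endpoint and the request/response
         // types are assumptions about Nomad's RPC surface, not defined in this file.
         //
         //	req := structs.AllocSpecificRequest{AllocID: allocID}
         //	var resp structs.SingleAllocResponse
         //	if err := rpcClient.RPC("Alloc.GetAlloc", &req, &resp); err != nil {
         //		return err
         //	}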
   201  
   202  // NewAllocRunner returns a new allocation runner.
   203  func NewAllocRunner(config *Config) (*allocRunner, error) {
   204  	alloc := config.Alloc
   205  	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
   206  	if tg == nil {
   207  		return nil, fmt.Errorf("failed to lookup task group %q", alloc.TaskGroup)
   208  	}
   209  
   210  	ar := &allocRunner{
   211  		id:                       alloc.ID,
   212  		alloc:                    alloc,
   213  		clientConfig:             config.ClientConfig,
   214  		consulClient:             config.Consul,
   215  		consulProxiesClient:      config.ConsulProxies,
   216  		sidsClient:               config.ConsulSI,
   217  		vaultClient:              config.Vault,
   218  		tasks:                    make(map[string]*taskrunner.TaskRunner, len(tg.Tasks)),
   219  		waitCh:                   make(chan struct{}),
   220  		destroyCh:                make(chan struct{}),
   221  		shutdownCh:               make(chan struct{}),
   222  		state:                    &state.State{},
   223  		stateDB:                  config.StateDB,
   224  		stateUpdater:             config.StateUpdater,
   225  		taskStateUpdatedCh:       make(chan struct{}, 1),
   226  		taskStateUpdateHandlerCh: make(chan struct{}),
   227  		allocUpdatedCh:           make(chan *structs.Allocation, 1),
   228  		deviceStatsReporter:      config.DeviceStatsReporter,
   229  		prevAllocWatcher:         config.PrevAllocWatcher,
   230  		prevAllocMigrator:        config.PrevAllocMigrator,
   231  		dynamicRegistry:          config.DynamicRegistry,
   232  		csiManager:               config.CSIManager,
   233  		cpusetManager:            config.CpusetManager,
   234  		devicemanager:            config.DeviceManager,
   235  		driverManager:            config.DriverManager,
   236  		serversContactedCh:       config.ServersContactedCh,
   237  		rpcClient:                config.RPCClient,
   238  		serviceRegWrapper:        config.ServiceRegWrapper,
   239  		checkStore:               config.CheckStore,
   240  		getter:                   config.Getter,
   241  	}
   242  
   243  	// Create the logger based on the allocation ID
   244  	ar.logger = config.Logger.Named("alloc_runner").With("alloc_id", alloc.ID)
   245  
   246  	// Create alloc broadcaster
   247  	ar.allocBroadcaster = cstructs.NewAllocBroadcaster(ar.logger)
   248  
   249  	// Create alloc dir
   250  	ar.allocDir = allocdir.NewAllocDir(ar.logger, config.ClientConfig.AllocDir, alloc.ID)
   251  
   252  	ar.taskCoordinator = tasklifecycle.NewCoordinator(ar.logger, tg.Tasks, ar.waitCh)
   253  
   254  	shutdownDelayCtx, shutdownDelayCancel := context.WithCancel(context.Background())
   255  	ar.shutdownDelayCtx = shutdownDelayCtx
   256  	ar.shutdownDelayCancelFn = shutdownDelayCancel
   257  
    258  	// Initialize the runner's hooks.
   259  	if err := ar.initRunnerHooks(config.ClientConfig); err != nil {
   260  		return nil, err
   261  	}
   262  
   263  	// Create the TaskRunners
   264  	if err := ar.initTaskRunners(tg.Tasks); err != nil {
   265  		return nil, err
   266  	}
   267  
   268  	return ar, nil
   269  }
   270  
   271  // initTaskRunners creates task runners but does *not* run them.
   272  func (ar *allocRunner) initTaskRunners(tasks []*structs.Task) error {
   273  	for _, task := range tasks {
   274  		trConfig := &taskrunner.Config{
   275  			Alloc:               ar.alloc,
   276  			ClientConfig:        ar.clientConfig,
   277  			Task:                task,
   278  			TaskDir:             ar.allocDir.NewTaskDir(task.Name),
   279  			Logger:              ar.logger,
   280  			StateDB:             ar.stateDB,
   281  			StateUpdater:        ar,
   282  			DynamicRegistry:     ar.dynamicRegistry,
   283  			Consul:              ar.consulClient,
   284  			ConsulProxies:       ar.consulProxiesClient,
   285  			ConsulSI:            ar.sidsClient,
   286  			Vault:               ar.vaultClient,
   287  			DeviceStatsReporter: ar.deviceStatsReporter,
   288  			CSIManager:          ar.csiManager,
   289  			DeviceManager:       ar.devicemanager,
   290  			DriverManager:       ar.driverManager,
   291  			ServersContactedCh:  ar.serversContactedCh,
   292  			StartConditionMetCh: ar.taskCoordinator.StartConditionForTask(task),
   293  			ShutdownDelayCtx:    ar.shutdownDelayCtx,
   294  			ServiceRegWrapper:   ar.serviceRegWrapper,
   295  			Getter:              ar.getter,
   296  		}
   297  
   298  		if ar.cpusetManager != nil {
   299  			trConfig.CpusetCgroupPathGetter = ar.cpusetManager.CgroupPathFor(ar.id, task.Name)
   300  		}
   301  
   302  		// Create, but do not Run, the task runner
   303  		tr, err := taskrunner.NewTaskRunner(trConfig)
   304  		if err != nil {
   305  			return fmt.Errorf("failed creating runner for task %q: %v", task.Name, err)
   306  		}
   307  
   308  		ar.tasks[task.Name] = tr
   309  	}
   310  	return nil
   311  }
   312  
   313  func (ar *allocRunner) WaitCh() <-chan struct{} {
   314  	return ar.waitCh
   315  }
   316  
   317  // Run the AllocRunner. Starts tasks if the alloc is non-terminal and closes
   318  // WaitCh when it exits. Should be started in a goroutine.
   319  func (ar *allocRunner) Run() {
   320  	// Close the wait channel on return
   321  	defer close(ar.waitCh)
   322  
   323  	// Start the task state update handler
   324  	go ar.handleTaskStateUpdates()
   325  
   326  	// Start the alloc update handler
   327  	go ar.handleAllocUpdates()
   328  
    329  	// If the task update chan has been closed, that means we've been shut down.
   330  	select {
   331  	case <-ar.taskStateUpdateHandlerCh:
   332  		return
   333  	default:
   334  	}
   335  
    336  	// When handling a (potentially restored) terminal alloc, ensure tasks and
    337  	// post-run hooks run to perform any cleanup not completed before the earlier termination.
   338  
   339  	// Run the prestart hooks if non-terminal
   340  	if ar.shouldRun() {
   341  		if err := ar.prerun(); err != nil {
   342  			ar.logger.Error("prerun failed", "error", err)
   343  
   344  			for _, tr := range ar.tasks {
   345  				tr.MarkFailedDead(fmt.Sprintf("failed to setup alloc: %v", err))
   346  			}
   347  
   348  			goto POST
   349  		}
   350  	}
   351  
   352  	// Run the runners (blocks until they exit)
   353  	ar.runTasks()
   354  
   355  POST:
   356  	if ar.isShuttingDown() {
   357  		return
   358  	}
   359  
   360  	// Run the postrun hooks
   361  	if err := ar.postrun(); err != nil {
   362  		ar.logger.Error("postrun failed", "error", err)
   363  	}
   364  
   365  }
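         // Usage sketch (assumed caller-side flow; error handling elided and the
         // restoring flag is illustrative): the client constructs the runner, restores
         // any previously persisted state when recovering after an agent restart, and
         // then starts Run in a goroutine, waiting on WaitCh for it to exit.
         //
         //	ar, err := NewAllocRunner(config)
         //	if err != nil {
         //		return err
         //	}
         //	if restoring {
         //		if err := ar.Restore(); err != nil {
         //			return err
         //		}
         //	}
         //	go ar.Run()
         //	...
         //	<-ar.WaitCh()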
   366  
   367  // shouldRun returns true if the alloc is in a state that the alloc runner
   368  // should run it.
   369  func (ar *allocRunner) shouldRun() bool {
   370  	// Do not run allocs that are terminal
   371  	if ar.Alloc().TerminalStatus() {
   372  		ar.logger.Trace("alloc terminal; not running",
   373  			"desired_status", ar.Alloc().DesiredStatus,
   374  			"client_status", ar.Alloc().ClientStatus,
   375  		)
   376  		return false
   377  	}
   378  
   379  	// It's possible that the alloc local state was marked terminal before
   380  	// the server copy of the alloc (checked above) was marked as terminal,
   381  	// so check the local state as well.
   382  	switch clientStatus := ar.AllocState().ClientStatus; clientStatus {
   383  	case structs.AllocClientStatusComplete, structs.AllocClientStatusFailed, structs.AllocClientStatusLost:
   384  		ar.logger.Trace("alloc terminal; updating server and not running", "status", clientStatus)
   385  		return false
   386  	}
   387  
   388  	return true
   389  }
   390  
   391  // runTasks is used to run the task runners and block until they exit.
   392  func (ar *allocRunner) runTasks() {
   393  	// Start and wait for all tasks.
   394  	for _, task := range ar.tasks {
   395  		go task.Run()
   396  	}
   397  	for _, task := range ar.tasks {
   398  		<-task.WaitCh()
   399  	}
   400  }
   401  
   402  // Alloc returns the current allocation being run by this runner as sent by the
   403  // server. This view of the allocation does not have updated task states.
   404  func (ar *allocRunner) Alloc() *structs.Allocation {
   405  	ar.allocLock.RLock()
   406  	defer ar.allocLock.RUnlock()
   407  	return ar.alloc
   408  }
   409  
   410  func (ar *allocRunner) setAlloc(updated *structs.Allocation) {
   411  	ar.allocLock.Lock()
   412  	ar.alloc = updated
   413  	ar.allocLock.Unlock()
   414  }
   415  
   416  // GetAllocDir returns the alloc dir which is safe for concurrent use.
   417  func (ar *allocRunner) GetAllocDir() *allocdir.AllocDir {
   418  	return ar.allocDir
   419  }
   420  
   421  // Restore state from database. Must be called after NewAllocRunner but before
   422  // Run.
   423  func (ar *allocRunner) Restore() error {
    424  	// Retrieve deployment status to avoid resetting it across agent
   425  	// restarts. Once a deployment status is set Nomad no longer monitors
   426  	// alloc health, so we must persist deployment state across restarts.
   427  	ds, err := ar.stateDB.GetDeploymentStatus(ar.id)
   428  	if err != nil {
   429  		return err
   430  	}
   431  
   432  	ns, err := ar.stateDB.GetNetworkStatus(ar.id)
   433  	if err != nil {
   434  		return err
   435  	}
   436  
   437  	ar.stateLock.Lock()
   438  	ar.state.DeploymentStatus = ds
   439  	ar.state.NetworkStatus = ns
   440  	ar.stateLock.Unlock()
   441  
   442  	states := make(map[string]*structs.TaskState)
   443  
   444  	// Restore task runners
   445  	for _, tr := range ar.tasks {
   446  		if err := tr.Restore(); err != nil {
   447  			return err
   448  		}
   449  		states[tr.Task().Name] = tr.TaskState()
   450  	}
   451  
   452  	ar.taskCoordinator.Restore(states)
   453  
   454  	return nil
   455  }
   456  
   457  // persistDeploymentStatus stores AllocDeploymentStatus.
   458  func (ar *allocRunner) persistDeploymentStatus(ds *structs.AllocDeploymentStatus) {
   459  	if err := ar.stateDB.PutDeploymentStatus(ar.id, ds); err != nil {
   460  		// While any persistence errors are very bad, the worst case
   461  		// scenario for failing to persist deployment status is that if
   462  		// the agent is restarted it will monitor the deployment status
   463  		// again. This could cause a deployment's status to change when
   464  		// that shouldn't happen. However, allowing that seems better
   465  		// than failing the entire allocation.
   466  		ar.logger.Error("error storing deployment status", "error", err)
   467  	}
   468  }
   469  
   470  // TaskStateUpdated is called by TaskRunner when a task's state has been
   471  // updated. It does not process the update synchronously but instead notifies a
    472  // goroutine that the state has changed. Since processing the state change may cause
   473  // the task to be killed (thus change its state again) it cannot be done
   474  // synchronously as it would cause a deadlock due to reentrancy.
   475  //
   476  // The goroutine is used to compute changes to the alloc's ClientStatus and to
   477  // update the server with the new state.
   478  func (ar *allocRunner) TaskStateUpdated() {
   479  	select {
   480  	case ar.taskStateUpdatedCh <- struct{}{}:
   481  	default:
   482  		// already pending updates
   483  	}
   484  }
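         // The nonblocking send above relies on taskStateUpdatedCh being buffered with
         // a capacity of 1: a single queued notification coalesces any number of task
         // state changes, and handleTaskStateUpdates re-reads every task's state when
         // it wakes. The pattern in miniature (names are illustrative):
         //
         //	notify := make(chan struct{}, 1)
         //
         //	// producer: never blocks; at most one notification stays queued
         //	select {
         //	case notify <- struct{}{}:
         //	default:
         //	}
         //
         //	// consumer: wakes once, then inspects the full current state
         //	<-notify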
   485  
   486  // handleTaskStateUpdates must be run in goroutine as it monitors
   487  // taskStateUpdatedCh for task state update notifications and processes task
   488  // states.
   489  //
   490  // Processing task state updates must be done in a goroutine as it may have to
   491  // kill tasks which causes further task state updates.
   492  func (ar *allocRunner) handleTaskStateUpdates() {
   493  	defer close(ar.taskStateUpdateHandlerCh)
   494  
   495  	hasSidecars := hasSidecarTasks(ar.tasks)
   496  
   497  	for done := false; !done; {
   498  		select {
   499  		case <-ar.taskStateUpdatedCh:
   500  		case <-ar.waitCh:
   501  			// Run has exited, sync once more to ensure final
   502  			// states are collected.
   503  			done = true
   504  		}
   505  
   506  		ar.logger.Trace("handling task state update", "done", done)
   507  
   508  		// Set with the appropriate event if task runners should be
   509  		// killed.
   510  		var killEvent *structs.TaskEvent
   511  
    512  		// If task runners should be killed, this is set to the name
    513  		// of the task at fault.
   514  		killTask := ""
   515  
   516  		// Task state has been updated; gather the state of the other tasks
   517  		trNum := len(ar.tasks)
   518  		liveRunners := make([]*taskrunner.TaskRunner, 0, trNum)
   519  		states := make(map[string]*structs.TaskState, trNum)
   520  
   521  		for name, tr := range ar.tasks {
   522  			taskState := tr.TaskState()
   523  			states[name] = taskState
   524  
   525  			if tr.IsPoststopTask() {
   526  				continue
   527  			}
   528  
   529  			// Capture live task runners in case we need to kill them
   530  			if taskState.State != structs.TaskStateDead {
   531  				liveRunners = append(liveRunners, tr)
   532  				continue
   533  			}
   534  
   535  			// Task is dead, determine if other tasks should be killed
   536  			if taskState.Failed {
   537  				// Only set failed event if no event has been
   538  				// set yet to give dead leaders priority.
   539  				if killEvent == nil {
   540  					killTask = name
   541  					killEvent = structs.NewTaskEvent(structs.TaskSiblingFailed).
   542  						SetFailedSibling(name)
   543  				}
   544  			} else if tr.IsLeader() {
   545  				killEvent = structs.NewTaskEvent(structs.TaskLeaderDead)
   546  			}
   547  		}
   548  
   549  		if len(liveRunners) > 0 {
    550  			// If all live runners are sidecars, kill the alloc.
   551  			onlySidecarsRemaining := hasSidecars && !hasNonSidecarTasks(liveRunners)
   552  			if killEvent == nil && onlySidecarsRemaining {
   553  				killEvent = structs.NewTaskEvent(structs.TaskMainDead)
   554  			}
   555  
   556  			// If there's a kill event set and live runners, kill them
   557  			if killEvent != nil {
   558  
   559  				// Log kill reason
   560  				switch killEvent.Type {
   561  				case structs.TaskLeaderDead:
   562  					ar.logger.Debug("leader task dead, destroying all tasks", "leader_task", killTask)
   563  				case structs.TaskMainDead:
   564  					ar.logger.Debug("main tasks dead, destroying all sidecar tasks")
   565  				default:
   566  					ar.logger.Debug("task failure, destroying all tasks", "failed_task", killTask)
   567  				}
   568  
   569  				// Emit kill event for live runners
   570  				for _, tr := range liveRunners {
   571  					tr.EmitEvent(killEvent)
   572  				}
   573  
   574  				// Kill 'em all
   575  				states = ar.killTasks()
   576  
   577  				// Wait for TaskRunners to exit before continuing. This will
   578  				// prevent looping before TaskRunners have transitioned to
   579  				// Dead.
   580  				for _, tr := range liveRunners {
   581  					ar.logger.Info("waiting for task to exit", "task", tr.Task().Name)
   582  					select {
   583  					case <-tr.WaitCh():
   584  					case <-ar.waitCh:
   585  					}
   586  				}
   587  			}
   588  		} else {
   589  			// If there are no live runners left kill all non-poststop task
    590  			// If there are no live runners left, kill all non-poststop task
   591  			for _, tr := range ar.tasks {
   592  				if tr.IsPoststopTask() {
   593  					continue
   594  				}
   595  
   596  				select {
   597  				case <-tr.WaitCh():
   598  				case <-ar.waitCh:
   599  				default:
   600  					// Kill task runner without setting an event because the
   601  					// task is already dead, it's just waiting in the alloc
   602  					// restart loop.
   603  					err := tr.Kill(context.TODO(), nil)
   604  					if err != nil {
   605  						ar.logger.Warn("failed to kill task", "task", tr.Task().Name, "error", err)
   606  					}
   607  				}
   608  			}
   609  		}
   610  
   611  		ar.taskCoordinator.TaskStateUpdated(states)
   612  
   613  		// Get the client allocation
   614  		calloc := ar.clientAlloc(states)
   615  
   616  		// Update the server
   617  		ar.stateUpdater.AllocStateUpdated(calloc)
   618  
   619  		// Broadcast client alloc to listeners
   620  		ar.allocBroadcaster.Send(calloc)
   621  	}
   622  }
   623  
   624  // hasNonSidecarTasks returns false if all the passed tasks are sidecar tasks
   625  func hasNonSidecarTasks(tasks []*taskrunner.TaskRunner) bool {
   626  	for _, tr := range tasks {
   627  		if !tr.IsSidecarTask() {
   628  			return true
   629  		}
   630  	}
   631  
   632  	return false
   633  }
   634  
   635  // hasSidecarTasks returns true if any of the passed tasks are sidecar tasks
   636  func hasSidecarTasks(tasks map[string]*taskrunner.TaskRunner) bool {
   637  	for _, tr := range tasks {
   638  		if tr.IsSidecarTask() {
   639  			return true
   640  		}
   641  	}
   642  
   643  	return false
   644  }
   645  
   646  // killTasks kills all task runners, leader (if there is one) first. Errors are
   647  // logged except taskrunner.ErrTaskNotRunning which is ignored. Task states
   648  // after Kill has been called are returned.
   649  func (ar *allocRunner) killTasks() map[string]*structs.TaskState {
   650  	var mu sync.Mutex
   651  	states := make(map[string]*structs.TaskState, len(ar.tasks))
   652  
   653  	// run alloc prekill hooks
   654  	ar.preKillHooks()
   655  
   656  	// Kill leader first, synchronously
   657  	for name, tr := range ar.tasks {
   658  		if !tr.IsLeader() {
   659  			continue
   660  		}
   661  
   662  		taskEvent := structs.NewTaskEvent(structs.TaskKilling)
   663  		taskEvent.SetKillTimeout(tr.Task().KillTimeout, ar.clientConfig.MaxKillTimeout)
   664  		err := tr.Kill(context.TODO(), taskEvent)
   665  		if err != nil && err != taskrunner.ErrTaskNotRunning {
   666  			ar.logger.Warn("error stopping leader task", "error", err, "task_name", name)
   667  		}
   668  
   669  		taskState := tr.TaskState()
   670  		states[name] = taskState
   671  		break
   672  	}
   673  
    674  	// Kill the remaining non-sidecar, non-poststop tasks concurrently.
   675  	wg := sync.WaitGroup{}
   676  	for name, tr := range ar.tasks {
   677  		// Filter out poststop and sidecar tasks so that they stop after all the other tasks are killed
   678  		if tr.IsLeader() || tr.IsPoststopTask() || tr.IsSidecarTask() {
   679  			continue
   680  		}
   681  
   682  		wg.Add(1)
   683  		go func(name string, tr *taskrunner.TaskRunner) {
   684  			defer wg.Done()
   685  			taskEvent := structs.NewTaskEvent(structs.TaskKilling)
   686  			taskEvent.SetKillTimeout(tr.Task().KillTimeout, ar.clientConfig.MaxKillTimeout)
   687  			err := tr.Kill(context.TODO(), taskEvent)
   688  			if err != nil && err != taskrunner.ErrTaskNotRunning {
   689  				ar.logger.Warn("error stopping task", "error", err, "task_name", name)
   690  			}
   691  
   692  			taskState := tr.TaskState()
   693  			mu.Lock()
   694  			states[name] = taskState
   695  			mu.Unlock()
   696  		}(name, tr)
   697  	}
   698  	wg.Wait()
   699  
   700  	// Kill the sidecar tasks last.
   701  	for name, tr := range ar.tasks {
   702  		if !tr.IsSidecarTask() || tr.IsLeader() || tr.IsPoststopTask() {
   703  			continue
   704  		}
   705  
   706  		wg.Add(1)
   707  		go func(name string, tr *taskrunner.TaskRunner) {
   708  			defer wg.Done()
   709  			taskEvent := structs.NewTaskEvent(structs.TaskKilling)
   710  			taskEvent.SetKillTimeout(tr.Task().KillTimeout, ar.clientConfig.MaxKillTimeout)
   711  			err := tr.Kill(context.TODO(), taskEvent)
   712  			if err != nil && err != taskrunner.ErrTaskNotRunning {
   713  				ar.logger.Warn("error stopping sidecar task", "error", err, "task_name", name)
   714  			}
   715  
   716  			taskState := tr.TaskState()
   717  			mu.Lock()
   718  			states[name] = taskState
   719  			mu.Unlock()
   720  		}(name, tr)
   721  	}
   722  	wg.Wait()
   723  
   724  	return states
   725  }
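         // The kill order above is: alloc prekill hooks, then the leader task
         // (synchronously), then the remaining main tasks concurrently, then sidecar
         // tasks. Poststop tasks are skipped entirely so they can still run afterwards.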
   726  
   727  // clientAlloc takes in the task states and returns an Allocation populated
    728  // with client-specific fields
   729  func (ar *allocRunner) clientAlloc(taskStates map[string]*structs.TaskState) *structs.Allocation {
   730  	ar.stateLock.Lock()
   731  	defer ar.stateLock.Unlock()
   732  
   733  	// store task states for AllocState to expose
   734  	ar.state.TaskStates = taskStates
   735  
   736  	a := &structs.Allocation{
   737  		ID:         ar.id,
   738  		TaskStates: taskStates,
   739  	}
   740  
   741  	if d := ar.state.DeploymentStatus; d != nil {
   742  		a.DeploymentStatus = d.Copy()
   743  	}
   744  
   745  	// Compute the ClientStatus
   746  	if ar.state.ClientStatus != "" {
   747  		// The client status is being forced
   748  		a.ClientStatus, a.ClientDescription = ar.state.ClientStatus, ar.state.ClientDescription
   749  	} else {
   750  		a.ClientStatus, a.ClientDescription = getClientStatus(taskStates)
   751  	}
   752  
   753  	// If the allocation is terminal, make sure all required fields are properly
   754  	// set.
   755  	if a.ClientTerminalStatus() {
   756  		alloc := ar.Alloc()
   757  
   758  		// If we are part of a deployment and the alloc has failed, mark the
    759  		// alloc as unhealthy. This guards against the watcher not being started.
    760  		// If the health status is already set, it is not overridden here.
   761  		if a.ClientStatus == structs.AllocClientStatusFailed &&
   762  			alloc.DeploymentID != "" && !a.DeploymentStatus.HasHealth() {
   763  			a.DeploymentStatus = &structs.AllocDeploymentStatus{
   764  				Healthy: pointer.Of(false),
   765  			}
   766  		}
   767  
    768  		// Make sure we have marked FinishedAt for every task. This is used
   769  		// to calculate the reschedule time for failed allocations.
   770  		now := time.Now()
   771  		for taskName := range ar.tasks {
   772  			ts, ok := a.TaskStates[taskName]
   773  			if !ok {
   774  				ts = &structs.TaskState{}
   775  				a.TaskStates[taskName] = ts
   776  			}
   777  			if ts.FinishedAt.IsZero() {
   778  				ts.FinishedAt = now
   779  			}
   780  		}
   781  	}
   782  
    783  	// Set the NetworkStatus, and default the DNS config if one was not returned from the client
   784  	netStatus := ar.state.NetworkStatus
   785  	if netStatus != nil {
   786  		a.NetworkStatus = netStatus
   787  	} else {
   788  		a.NetworkStatus = new(structs.AllocNetworkStatus)
   789  	}
   790  
   791  	if a.NetworkStatus.DNS == nil {
   792  		alloc := ar.Alloc()
   793  		nws := alloc.Job.LookupTaskGroup(alloc.TaskGroup).Networks
   794  		if len(nws) > 0 {
   795  			a.NetworkStatus.DNS = nws[0].DNS.Copy()
   796  		}
   797  	}
   798  
   799  	return a
   800  }
   801  
   802  // getClientStatus takes in the task states for a given allocation and computes
   803  // the client status and description
   804  func getClientStatus(taskStates map[string]*structs.TaskState) (status, description string) {
   805  	var pending, running, dead, failed bool
   806  	for _, state := range taskStates {
   807  		switch state.State {
   808  		case structs.TaskStateRunning:
   809  			running = true
   810  		case structs.TaskStatePending:
   811  			pending = true
   812  		case structs.TaskStateDead:
   813  			if state.Failed {
   814  				failed = true
   815  			} else {
   816  				dead = true
   817  			}
   818  		}
   819  	}
   820  
   821  	// Determine the alloc status
   822  	if failed {
   823  		return structs.AllocClientStatusFailed, "Failed tasks"
   824  	} else if running {
   825  		return structs.AllocClientStatusRunning, "Tasks are running"
   826  	} else if pending {
   827  		return structs.AllocClientStatusPending, "No tasks have started"
   828  	} else if dead {
   829  		return structs.AllocClientStatusComplete, "All tasks have completed"
   830  	}
   831  
   832  	return "", ""
   833  }
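         // For example, an allocation with one failed task and one still-running task
         // reports AllocClientStatusFailed: failure takes precedence over running,
         // which takes precedence over pending, which takes precedence over complete.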
   834  
   835  // SetClientStatus is a helper for forcing a specific client
   836  // status on the alloc runner. This is used during restore errors
   837  // when the task state can't be restored.
   838  func (ar *allocRunner) SetClientStatus(clientStatus string) {
   839  	ar.stateLock.Lock()
   840  	defer ar.stateLock.Unlock()
   841  	ar.state.ClientStatus = clientStatus
   842  }
   843  
   844  func (ar *allocRunner) SetNetworkStatus(s *structs.AllocNetworkStatus) {
   845  	ar.stateLock.Lock()
   846  	defer ar.stateLock.Unlock()
   847  	ar.state.NetworkStatus = s.Copy()
   848  }
   849  
   850  func (ar *allocRunner) NetworkStatus() *structs.AllocNetworkStatus {
   851  	ar.stateLock.Lock()
   852  	defer ar.stateLock.Unlock()
   853  	return ar.state.NetworkStatus.Copy()
   854  }
   855  
   856  // setIndexes is a helper for forcing alloc state on the alloc runner. This is
   857  // used during reconnect when the task has been marked unknown by the server.
   858  func (ar *allocRunner) setIndexes(update *structs.Allocation) {
   859  	ar.allocLock.Lock()
   860  	defer ar.allocLock.Unlock()
   861  	ar.alloc.AllocModifyIndex = update.AllocModifyIndex
   862  	ar.alloc.ModifyIndex = update.ModifyIndex
   863  	ar.alloc.ModifyTime = update.ModifyTime
   864  }
   865  
   866  // AllocState returns a copy of allocation state including a snapshot of task
   867  // states.
   868  func (ar *allocRunner) AllocState() *state.State {
   869  	ar.stateLock.RLock()
   870  	state := ar.state.Copy()
   871  	ar.stateLock.RUnlock()
   872  
   873  	// If TaskStateUpdated has not been called yet, ar.state.TaskStates
   874  	// won't be set as it is not the canonical source of TaskStates.
   875  	if len(state.TaskStates) == 0 {
    876  		state.TaskStates = make(map[string]*structs.TaskState, len(ar.tasks))
   877  		for k, tr := range ar.tasks {
   878  			state.TaskStates[k] = tr.TaskState()
   879  		}
   880  	}
   881  
   882  	// Generate alloc to get other state fields
   883  	alloc := ar.clientAlloc(state.TaskStates)
   884  	state.ClientStatus = alloc.ClientStatus
   885  	state.ClientDescription = alloc.ClientDescription
   886  	state.DeploymentStatus = alloc.DeploymentStatus
   887  
   888  	return state
   889  }
   890  
    891  // Update asynchronously updates the running allocation with a new version
   892  // received from the server.
   893  // When processing a new update, we will first attempt to drain stale updates
   894  // from the queue, before appending the new one.
   895  func (ar *allocRunner) Update(update *structs.Allocation) {
   896  	select {
   897  	// Drain queued update from the channel if possible, and check the modify
   898  	// index
   899  	case oldUpdate := <-ar.allocUpdatedCh:
   900  		// If the old update is newer than the replacement, then skip the new one
   901  		// and return. This case shouldn't happen, but may in the case of a bug
   902  		// elsewhere inside the system.
   903  		if oldUpdate.AllocModifyIndex > update.AllocModifyIndex {
   904  			ar.logger.Debug("Discarding allocation update due to newer alloc revision in queue",
   905  				"old_modify_index", oldUpdate.AllocModifyIndex,
   906  				"new_modify_index", update.AllocModifyIndex)
   907  			ar.allocUpdatedCh <- oldUpdate
   908  			return
   909  		} else {
   910  			ar.logger.Debug("Discarding allocation update",
   911  				"skipped_modify_index", oldUpdate.AllocModifyIndex,
   912  				"new_modify_index", update.AllocModifyIndex)
   913  		}
   914  	case <-ar.waitCh:
   915  		ar.logger.Trace("AllocRunner has terminated, skipping alloc update",
   916  			"modify_index", update.AllocModifyIndex)
   917  		return
   918  	default:
   919  	}
   920  
   921  	if update.DesiredTransition.ShouldIgnoreShutdownDelay() {
   922  		ar.shutdownDelayCancelFn()
   923  	}
   924  
   925  	// Queue the new update
   926  	ar.allocUpdatedCh <- update
   927  }
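         // Update therefore acts as a capacity-1 "latest value wins" mailbox: at most
         // one allocation waits in allocUpdatedCh, and a newer version replaces it
         // unless the queued copy already carries a higher AllocModifyIndex. The
         // pattern in miniature (names are illustrative):
         //
         //	updates := make(chan *structs.Allocation, 1)
         //
         //	// drain any stale queued value, then queue the newest one
         //	select {
         //	case <-updates:
         //	default:
         //	}
         //	updates <- next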
   928  
   929  func (ar *allocRunner) handleAllocUpdates() {
   930  	for {
   931  		select {
   932  		case update := <-ar.allocUpdatedCh:
   933  			ar.handleAllocUpdate(update)
   934  		case <-ar.waitCh:
   935  			return
   936  		}
   937  	}
   938  }
   939  
    940  // handleAllocUpdate applies a new version of the allocation: it runs update
    941  // hooks when the update is non-terminal, propagates the update to every task
    942  // runner, and kills tasks when the alloc transitions to a terminal state.
   943  func (ar *allocRunner) handleAllocUpdate(update *structs.Allocation) {
   944  	// Detect Stop updates
   945  	stopping := !ar.Alloc().TerminalStatus() && update.TerminalStatus()
   946  
   947  	// Update ar.alloc
   948  	ar.setAlloc(update)
   949  
   950  	// Run update hooks if not stopping or dead
   951  	if !update.TerminalStatus() {
   952  		if err := ar.update(update); err != nil {
   953  			ar.logger.Error("error running update hooks", "error", err)
   954  		}
   955  
   956  	}
   957  
   958  	// Update task runners
   959  	for _, tr := range ar.tasks {
   960  		tr.Update(update)
   961  	}
   962  
   963  	// If alloc is being terminated, kill all tasks, leader first
   964  	if stopping {
   965  		ar.killTasks()
   966  	}
   967  
   968  }
   969  
   970  func (ar *allocRunner) Listener() *cstructs.AllocListener {
   971  	return ar.allocBroadcaster.Listen()
   972  }
   973  
   974  func (ar *allocRunner) destroyImpl() {
   975  	// Stop any running tasks and persist states in case the client is
   976  	// shutdown before Destroy finishes.
   977  	states := ar.killTasks()
   978  	calloc := ar.clientAlloc(states)
   979  	ar.stateUpdater.AllocStateUpdated(calloc)
   980  
   981  	// Wait for tasks to exit and postrun hooks to finish
   982  	<-ar.waitCh
   983  
   984  	// Run destroy hooks
   985  	if err := ar.destroy(); err != nil {
   986  		ar.logger.Warn("error running destroy hooks", "error", err)
   987  	}
   988  
   989  	// Wait for task state update handler to exit before removing local
   990  	// state if Run() ran at all.
   991  	<-ar.taskStateUpdateHandlerCh
   992  
   993  	// Mark alloc as destroyed
   994  	ar.destroyedLock.Lock()
   995  
    996  	// Clean up the state db while holding the lock to avoid a race with a
    997  	// periodic PersistState call that could resurrect the alloc
   998  	if err := ar.stateDB.DeleteAllocationBucket(ar.id); err != nil {
   999  		ar.logger.Warn("failed to delete allocation state", "error", err)
  1000  	}
  1001  
  1002  	if !ar.shutdown {
  1003  		ar.shutdown = true
  1004  		close(ar.shutdownCh)
  1005  	}
  1006  
  1007  	ar.destroyed = true
  1008  	close(ar.destroyCh)
  1009  
  1010  	ar.destroyedLock.Unlock()
  1011  }
  1012  
  1013  func (ar *allocRunner) PersistState() error {
  1014  	ar.destroyedLock.Lock()
  1015  	defer ar.destroyedLock.Unlock()
  1016  
  1017  	if ar.destroyed {
  1018  		err := ar.stateDB.DeleteAllocationBucket(ar.id, cstate.WithBatchMode())
  1019  		if err != nil {
  1020  			ar.logger.Warn("failed to delete allocation bucket", "error", err)
  1021  		}
  1022  		return nil
  1023  	}
  1024  
  1025  	// persist network status, wrapping in a func to release state lock as early as possible
  1026  	err := func() error {
  1027  		ar.stateLock.Lock()
  1028  		defer ar.stateLock.Unlock()
  1029  		if ar.state.NetworkStatus != nil {
  1030  			err := ar.stateDB.PutNetworkStatus(ar.id, ar.state.NetworkStatus, cstate.WithBatchMode())
  1031  			if err != nil {
  1032  				return err
  1033  			}
  1034  		}
  1035  		return nil
  1036  	}()
  1037  	if err != nil {
  1038  		return err
  1039  	}
  1040  
  1041  	// TODO: consider persisting deployment state along with task status.
  1042  	// While we study why only the alloc is persisted, I opted to maintain current
  1043  	// behavior and not risk adding yet more IO calls unnecessarily.
  1044  	return ar.stateDB.PutAllocation(ar.Alloc(), cstate.WithBatchMode())
  1045  }
  1046  
  1047  // Destroy the alloc runner by stopping it if it is still running and cleaning
  1048  // up all of its resources.
  1049  //
  1050  // This method is safe for calling concurrently with Run() and will cause it to
  1051  // exit (thus closing WaitCh).
  1052  // When the destroy action is completed, it will close DestroyCh().
  1053  func (ar *allocRunner) Destroy() {
  1054  	ar.destroyedLock.Lock()
  1055  	defer ar.destroyedLock.Unlock()
  1056  
  1057  	if ar.destroyed {
  1058  		// Only destroy once
  1059  		return
  1060  	}
  1061  
  1062  	if ar.destroyLaunched {
  1063  		// Only dispatch a destroy once
  1064  		return
  1065  	}
  1066  
  1067  	ar.destroyLaunched = true
  1068  
  1069  	// Synchronize calls to shutdown/destroy
  1070  	if ar.shutdownLaunched {
  1071  		go func() {
  1072  			ar.logger.Debug("Waiting for shutdown before destroying runner")
  1073  			<-ar.shutdownCh
  1074  			ar.destroyImpl()
  1075  		}()
  1076  
  1077  		return
  1078  	}
  1079  
  1080  	go ar.destroyImpl()
  1081  }
  1082  
  1083  // IsDestroyed returns true if the alloc runner has been destroyed (stopped and
  1084  // garbage collected).
  1085  //
  1086  // This method is safe for calling concurrently with Run(). Callers must
  1087  // receive on WaitCh() to block until alloc runner has stopped and been
  1088  // destroyed.
  1089  func (ar *allocRunner) IsDestroyed() bool {
  1090  	ar.destroyedLock.Lock()
  1091  	defer ar.destroyedLock.Unlock()
  1092  	return ar.destroyed
  1093  }
  1094  
  1095  // IsWaiting returns true if the alloc runner is waiting for its previous
  1096  // allocation to terminate.
  1097  //
  1098  // This method is safe for calling concurrently with Run().
  1099  func (ar *allocRunner) IsWaiting() bool {
  1100  	return ar.prevAllocWatcher.IsWaiting()
  1101  }
  1102  
  1103  // isShuttingDown returns true if the alloc runner is in a shutdown state
  1104  // due to a call to Shutdown() or Destroy()
  1105  func (ar *allocRunner) isShuttingDown() bool {
  1106  	ar.destroyedLock.Lock()
  1107  	defer ar.destroyedLock.Unlock()
  1108  	return ar.shutdownLaunched
  1109  }
  1110  
  1111  // DestroyCh is a channel that is closed when an allocrunner is closed due to
  1112  // an explicit call to Destroy().
  1113  func (ar *allocRunner) DestroyCh() <-chan struct{} {
  1114  	return ar.destroyCh
  1115  }
  1116  
  1117  // ShutdownCh is a channel that is closed when an allocrunner is closed due to
  1118  // either an explicit call to Shutdown(), or Destroy().
  1119  func (ar *allocRunner) ShutdownCh() <-chan struct{} {
  1120  	return ar.shutdownCh
  1121  }
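         // Caller-side sketch of the two teardown paths (assumed usage, error handling
         // elided): a graceful agent shutdown that leaves task state recoverable uses
         // Shutdown, while garbage collection of the allocation and its local state
         // uses Destroy.
         //
         //	ar.Shutdown()
         //	<-ar.ShutdownCh()
         //
         //	// or, to stop the tasks and remove all local state:
         //	ar.Destroy()
         //	<-ar.DestroyCh()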
  1122  
  1123  // Shutdown AllocRunner gracefully. Asynchronously shuts down all TaskRunners.
  1124  // Tasks are unaffected and may be restored.
   1125  // When the shutdown action is completed, it will close ShutdownCh().
  1126  func (ar *allocRunner) Shutdown() {
  1127  	ar.destroyedLock.Lock()
  1128  	defer ar.destroyedLock.Unlock()
  1129  
  1130  	// Destroy is a superset of Shutdown so there's nothing to do if this
  1131  	// has already been destroyed.
  1132  	if ar.destroyed {
  1133  		return
  1134  	}
  1135  
  1136  	// Destroy is a superset of Shutdown so if it's been marked for destruction,
  1137  	// don't try and shutdown in parallel. If shutdown has been launched, don't
  1138  	// try again.
  1139  	if ar.destroyLaunched || ar.shutdownLaunched {
  1140  		return
  1141  	}
  1142  
  1143  	ar.shutdownLaunched = true
  1144  
  1145  	go func() {
  1146  		ar.logger.Trace("shutting down")
  1147  
  1148  		// Shutdown tasks gracefully if they were run
  1149  		wg := sync.WaitGroup{}
  1150  		for _, tr := range ar.tasks {
  1151  			wg.Add(1)
  1152  			go func(tr *taskrunner.TaskRunner) {
  1153  				tr.Shutdown()
  1154  				wg.Done()
  1155  			}(tr)
  1156  		}
  1157  		wg.Wait()
  1158  
  1159  		// Wait for Run to exit
  1160  		<-ar.waitCh
  1161  
  1162  		// Run shutdown hooks
  1163  		ar.shutdownHooks()
  1164  
  1165  		// Wait for updater to finish its final run
  1166  		<-ar.taskStateUpdateHandlerCh
  1167  
  1168  		ar.destroyedLock.Lock()
  1169  		ar.shutdown = true
  1170  		close(ar.shutdownCh)
  1171  		ar.destroyedLock.Unlock()
  1172  	}()
  1173  }
  1174  
  1175  // IsMigrating returns true if the alloc runner is migrating data from its
  1176  // previous allocation.
  1177  //
  1178  // This method is safe for calling concurrently with Run().
  1179  func (ar *allocRunner) IsMigrating() bool {
  1180  	return ar.prevAllocMigrator.IsMigrating()
  1181  }
  1182  
  1183  func (ar *allocRunner) StatsReporter() interfaces.AllocStatsReporter {
  1184  	return ar
  1185  }
  1186  
  1187  // LatestAllocStats returns the latest stats for an allocation. If taskFilter
  1188  // is set, only stats for that task -- if it exists -- are returned.
  1189  func (ar *allocRunner) LatestAllocStats(taskFilter string) (*cstructs.AllocResourceUsage, error) {
  1190  	astat := &cstructs.AllocResourceUsage{
  1191  		Tasks: make(map[string]*cstructs.TaskResourceUsage, len(ar.tasks)),
  1192  		ResourceUsage: &cstructs.ResourceUsage{
  1193  			MemoryStats: &cstructs.MemoryStats{},
  1194  			CpuStats:    &cstructs.CpuStats{},
  1195  			DeviceStats: []*device.DeviceGroupStats{},
  1196  		},
  1197  	}
  1198  
  1199  	for name, tr := range ar.tasks {
  1200  		if taskFilter != "" && taskFilter != name {
   1201  			// Getting stats for a particular task and it's not this one!
  1202  			continue
  1203  		}
  1204  
  1205  		if usage := tr.LatestResourceUsage(); usage != nil {
  1206  			astat.Tasks[name] = usage
  1207  			astat.ResourceUsage.Add(usage.ResourceUsage)
  1208  			if usage.Timestamp > astat.Timestamp {
  1209  				astat.Timestamp = usage.Timestamp
  1210  			}
  1211  		}
  1212  	}
  1213  
  1214  	return astat, nil
  1215  }
  1216  
  1217  func (ar *allocRunner) GetTaskEventHandler(taskName string) drivermanager.EventHandler {
  1218  	if tr, ok := ar.tasks[taskName]; ok {
  1219  		return func(ev *drivers.TaskEvent) {
  1220  			tr.EmitEvent(&structs.TaskEvent{
  1221  				Type:          structs.TaskDriverMessage,
  1222  				Time:          ev.Timestamp.UnixNano(),
  1223  				Details:       ev.Annotations,
  1224  				DriverMessage: ev.Message,
  1225  			})
  1226  		}
  1227  	}
  1228  	return nil
  1229  }
  1230  
  1231  // Restart satisfies the WorkloadRestarter interface and restarts all tasks
  1232  // that are currently running.
  1233  func (ar *allocRunner) Restart(ctx context.Context, event *structs.TaskEvent, failure bool) error {
  1234  	return ar.restartTasks(ctx, event, failure, false)
  1235  }
  1236  
  1237  // RestartTask restarts the provided task.
  1238  func (ar *allocRunner) RestartTask(taskName string, event *structs.TaskEvent) error {
  1239  	tr, ok := ar.tasks[taskName]
  1240  	if !ok {
  1241  		return fmt.Errorf("Could not find task runner for task: %s", taskName)
  1242  	}
  1243  
  1244  	return tr.Restart(context.TODO(), event, false)
  1245  }
  1246  
  1247  // RestartRunning restarts all tasks that are currently running.
  1248  func (ar *allocRunner) RestartRunning(event *structs.TaskEvent) error {
  1249  	return ar.restartTasks(context.TODO(), event, false, false)
  1250  }
  1251  
  1252  // RestartAll restarts all tasks in the allocation, including dead ones. They
  1253  // will restart following their lifecycle order.
  1254  func (ar *allocRunner) RestartAll(event *structs.TaskEvent) error {
  1255  	// Restart the taskCoordinator to allow dead tasks to run again.
  1256  	ar.taskCoordinator.Restart()
  1257  	return ar.restartTasks(context.TODO(), event, false, true)
  1258  }
  1259  
  1260  // restartTasks restarts all task runners concurrently.
  1261  func (ar *allocRunner) restartTasks(ctx context.Context, event *structs.TaskEvent, failure bool, force bool) error {
  1262  	waitCh := make(chan struct{})
  1263  	var err *multierror.Error
  1264  	var errMutex sync.Mutex
  1265  
  1266  	// run alloc task restart hooks
  1267  	ar.taskRestartHooks()
  1268  
  1269  	go func() {
  1270  		var wg sync.WaitGroup
  1271  		defer close(waitCh)
  1272  		for tn, tr := range ar.tasks {
  1273  			wg.Add(1)
  1274  			go func(taskName string, taskRunner *taskrunner.TaskRunner) {
  1275  				defer wg.Done()
  1276  
  1277  				var e error
  1278  				if force {
  1279  					e = taskRunner.ForceRestart(ctx, event.Copy(), failure)
  1280  				} else {
  1281  					e = taskRunner.Restart(ctx, event.Copy(), failure)
  1282  				}
  1283  
  1284  				// Ignore ErrTaskNotRunning errors since tasks that are not
  1285  				// running are expected to not be restarted.
  1286  				if e != nil && e != taskrunner.ErrTaskNotRunning {
  1287  					errMutex.Lock()
  1288  					defer errMutex.Unlock()
  1289  					err = multierror.Append(err, fmt.Errorf("failed to restart task %s: %v", taskName, e))
  1290  				}
  1291  			}(tn, tr)
  1292  		}
  1293  		wg.Wait()
  1294  	}()
  1295  
  1296  	select {
  1297  	case <-waitCh:
  1298  	case <-ctx.Done():
  1299  	}
  1300  
  1301  	return err.ErrorOrNil()
  1302  }
  1303  
  1304  // Signal sends a signal request to task runners inside an allocation. If the
  1305  // taskName is empty, then it is sent to all tasks.
  1306  func (ar *allocRunner) Signal(taskName, signal string) error {
  1307  	event := structs.NewTaskEvent(structs.TaskSignaling).SetSignalText(signal)
  1308  
  1309  	if taskName != "" {
  1310  		tr, ok := ar.tasks[taskName]
  1311  		if !ok {
  1312  			return fmt.Errorf("Task not found")
  1313  		}
  1314  
  1315  		return tr.Signal(event, signal)
  1316  	}
  1317  
  1318  	var err *multierror.Error
  1319  
  1320  	for tn, tr := range ar.tasks {
  1321  		rerr := tr.Signal(event.Copy(), signal)
  1322  		if rerr != nil {
  1323  			err = multierror.Append(err, fmt.Errorf("Failed to signal task: %s, err: %v", tn, rerr))
  1324  		}
  1325  	}
  1326  
  1327  	return err.ErrorOrNil()
  1328  }
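         // For example, ar.Signal("", "SIGHUP") fans the signal out to every task in
         // the allocation, while ar.Signal("web", "SIGHUP") targets only the task
         // named "web" (the task name here is illustrative).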
  1329  
  1330  // Reconnect logs a reconnect event for each task in the allocation and syncs the current alloc state with the server.
  1331  func (ar *allocRunner) Reconnect(update *structs.Allocation) (err error) {
  1332  	event := structs.NewTaskEvent(structs.TaskClientReconnected)
  1333  	event.Time = time.Now().UnixNano()
  1334  	for _, tr := range ar.tasks {
  1335  		tr.AppendEvent(event)
  1336  	}
  1337  
  1338  	// Update the client alloc with the server side indexes.
  1339  	ar.setIndexes(update)
  1340  
  1341  	// Calculate alloc state to get the final state with the new events.
   1342  	// Cannot rely on AllocState as it won't recompute TaskStates once they are set.
  1343  	states := make(map[string]*structs.TaskState, len(ar.tasks))
  1344  	for name, tr := range ar.tasks {
  1345  		states[name] = tr.TaskState()
  1346  	}
  1347  
  1348  	// Build the client allocation
  1349  	alloc := ar.clientAlloc(states)
  1350  
  1351  	// Update the client state store.
  1352  	err = ar.stateUpdater.PutAllocation(alloc)
  1353  	if err != nil {
  1354  		return
  1355  	}
  1356  
  1357  	// Update the server.
  1358  	ar.stateUpdater.AllocStateUpdated(alloc)
  1359  
  1360  	// Broadcast client alloc to listeners.
  1361  	err = ar.allocBroadcaster.Send(alloc)
  1362  
  1363  	return
  1364  }
  1365  
  1366  func (ar *allocRunner) GetTaskExecHandler(taskName string) drivermanager.TaskExecHandler {
  1367  	tr, ok := ar.tasks[taskName]
  1368  	if !ok {
  1369  		return nil
  1370  	}
  1371  
  1372  	return tr.TaskExecHandler()
  1373  }
  1374  
  1375  func (ar *allocRunner) GetTaskDriverCapabilities(taskName string) (*drivers.Capabilities, error) {
  1376  	tr, ok := ar.tasks[taskName]
  1377  	if !ok {
  1378  		return nil, fmt.Errorf("task not found")
  1379  	}
  1380  
  1381  	return tr.DriverCapabilities()
  1382  }