github.com/ferranbt/nomad@v0.9.3-0.20190607002617-85c449b7667c/client/allocrunner/alloc_runner.go

     1  package allocrunner
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"path/filepath"
     7  	"sync"
     8  	"time"
     9  
    10  	log "github.com/hashicorp/go-hclog"
    11  	multierror "github.com/hashicorp/go-multierror"
    12  	"github.com/hashicorp/nomad/client/allocdir"
    13  	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
    14  	"github.com/hashicorp/nomad/client/allocrunner/state"
    15  	"github.com/hashicorp/nomad/client/allocrunner/taskrunner"
    16  	"github.com/hashicorp/nomad/client/allocwatcher"
    17  	"github.com/hashicorp/nomad/client/config"
    18  	"github.com/hashicorp/nomad/client/consul"
    19  	"github.com/hashicorp/nomad/client/devicemanager"
    20  	cinterfaces "github.com/hashicorp/nomad/client/interfaces"
    21  	"github.com/hashicorp/nomad/client/pluginmanager/drivermanager"
    22  	cstate "github.com/hashicorp/nomad/client/state"
    23  	cstructs "github.com/hashicorp/nomad/client/structs"
    24  	"github.com/hashicorp/nomad/client/vaultclient"
    25  	"github.com/hashicorp/nomad/helper"
    26  	"github.com/hashicorp/nomad/nomad/structs"
    27  	"github.com/hashicorp/nomad/plugins/device"
    28  	"github.com/hashicorp/nomad/plugins/drivers"
    29  )
    30  
    31  // allocRunner is used to run all the tasks in a given allocation
    32  type allocRunner struct {
    33  	// id is the ID of the allocation. Can be accessed without a lock
    34  	id string
    35  
    36  	// Logger is the logger for the alloc runner.
    37  	logger log.Logger
    38  
    39  	clientConfig *config.Config
    40  
    41  	// stateUpdater is used to emit updated alloc state
    42  	stateUpdater cinterfaces.AllocStateHandler
    43  
    44  	// taskStateUpdatedCh is ticked whenever task state has changed. Must
    45  	// have a buffer of 1 to allow nonblocking notification of state updates
    46  	// while the goroutine is already processing a previous update.
    47  	taskStateUpdatedCh chan struct{}
    48  
    49  	// taskStateUpdateHandlerCh is closed when the task state handling
    50  	// goroutine exits. It is unsafe to destroy the local allocation state
    51  	// before this goroutine exits.
    52  	taskStateUpdateHandlerCh chan struct{}
    53  
    54  	// allocUpdatedCh is a channel that is used to stream allocation updates into
    55  	// the allocUpdate handler. Must have a buffer of 1 to allow nonblocking
    56  	// notification of new allocation updates while the goroutine is processing a
    57  	// previous update.
    58  	allocUpdatedCh chan *structs.Allocation
    59  
    60  	// consulClient is the client used by the consul service hook for
    61  	// registering services and checks
    62  	consulClient consul.ConsulServiceAPI
    63  
    64  	// vaultClient is used to manage Vault tokens
    65  	vaultClient vaultclient.VaultClient
    66  
    67  	// waitCh is closed when the Run loop has exited
    68  	waitCh chan struct{}
    69  
    70  	// destroyed is true when the Run loop has exited, postrun hooks have
    71  	// run, and alloc runner has been destroyed. Must acquire destroyedLock
    72  	// to access.
    73  	destroyed bool
    74  
    75  	// destroyCh is closed when the Run loop has exited, postrun hooks have
    76  	// run, and alloc runner has been destroyed.
    77  	destroyCh chan struct{}
    78  
    79  	// shutdown is true when the Run loop has exited, and shutdown hooks have
    80  	// run. Must acquire destroyedLock to access.
    81  	shutdown bool
    82  
    83  	// shutdownCh is closed when the Run loop has exited, and shutdown hooks
    84  	// have run.
    85  	shutdownCh chan struct{}
    86  
    87  	// destroyLaunched is true if Destroy has been called. Must acquire
    88  	// destroyedLock to access.
    89  	destroyLaunched bool
    90  
    91  	// shutdownLaunched is true if Shutdown has been called. Must acquire
    92  	// destroyedLock to access.
    93  	shutdownLaunched bool
    94  
    95  	// destroyedLock guards destroyed, destroyLaunched, shutdown, and
    96  	// shutdownLaunched, and serializes Shutdown/Destroy calls.
    97  	destroyedLock sync.Mutex
    98  
    99  	// Alloc captures the allocation being run.
   100  	alloc     *structs.Allocation
   101  	allocLock sync.RWMutex
   102  
   103  	// state is the alloc runner's state
   104  	state     *state.State
   105  	stateLock sync.RWMutex
   106  
   107  	stateDB cstate.StateDB
   108  
   109  	// allocDir is used to build the allocations directory structure.
   110  	allocDir *allocdir.AllocDir
   111  
   112  	// runnerHooks are alloc runner lifecycle hooks that should be run on state
    113  	// transitions.
   114  	runnerHooks []interfaces.RunnerHook
   115  
   116  	// tasks are the set of task runners
   117  	tasks map[string]*taskrunner.TaskRunner
   118  
   119  	// deviceStatsReporter is used to lookup resource usage for alloc devices
   120  	deviceStatsReporter cinterfaces.DeviceStatsReporter
   121  
   122  	// allocBroadcaster sends client allocation updates to all listeners
   123  	allocBroadcaster *cstructs.AllocBroadcaster
   124  
   125  	// prevAllocWatcher allows waiting for any previous or preempted allocations
   126  	// to exit
   127  	prevAllocWatcher allocwatcher.PrevAllocWatcher
   128  
    129  	// prevAllocMigrator allows the migration of a previous allocation's alloc dir.
   130  	prevAllocMigrator allocwatcher.PrevAllocMigrator
   131  
   132  	// devicemanager is used to mount devices as well as lookup device
   133  	// statistics
   134  	devicemanager devicemanager.Manager
   135  
   136  	// driverManager is responsible for dispensing driver plugins and registering
   137  	// event handlers
   138  	driverManager drivermanager.Manager
   139  
   140  	// serversContactedCh is passed to TaskRunners so they can detect when
   141  	// servers have been contacted for the first time in case of a failed
   142  	// restore.
   143  	serversContactedCh chan struct{}
   144  }
   145  
   146  // NewAllocRunner returns a new allocation runner.
   147  func NewAllocRunner(config *Config) (*allocRunner, error) {
   148  	alloc := config.Alloc
   149  	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
   150  	if tg == nil {
   151  		return nil, fmt.Errorf("failed to lookup task group %q", alloc.TaskGroup)
   152  	}
   153  
   154  	ar := &allocRunner{
   155  		id:                       alloc.ID,
   156  		alloc:                    alloc,
   157  		clientConfig:             config.ClientConfig,
   158  		consulClient:             config.Consul,
   159  		vaultClient:              config.Vault,
   160  		tasks:                    make(map[string]*taskrunner.TaskRunner, len(tg.Tasks)),
   161  		waitCh:                   make(chan struct{}),
   162  		destroyCh:                make(chan struct{}),
   163  		shutdownCh:               make(chan struct{}),
   164  		state:                    &state.State{},
   165  		stateDB:                  config.StateDB,
   166  		stateUpdater:             config.StateUpdater,
   167  		taskStateUpdatedCh:       make(chan struct{}, 1),
   168  		taskStateUpdateHandlerCh: make(chan struct{}),
   169  		allocUpdatedCh:           make(chan *structs.Allocation, 1),
   170  		deviceStatsReporter:      config.DeviceStatsReporter,
   171  		prevAllocWatcher:         config.PrevAllocWatcher,
   172  		prevAllocMigrator:        config.PrevAllocMigrator,
   173  		devicemanager:            config.DeviceManager,
   174  		driverManager:            config.DriverManager,
   175  		serversContactedCh:       config.ServersContactedCh,
   176  	}
   177  
   178  	// Create the logger based on the allocation ID
   179  	ar.logger = config.Logger.Named("alloc_runner").With("alloc_id", alloc.ID)
   180  
   181  	// Create alloc broadcaster
   182  	ar.allocBroadcaster = cstructs.NewAllocBroadcaster(ar.logger)
   183  
   184  	// Create alloc dir
   185  	ar.allocDir = allocdir.NewAllocDir(ar.logger, filepath.Join(config.ClientConfig.AllocDir, alloc.ID))
   186  
    187  	// Initialize the runner's hooks.
   188  	ar.initRunnerHooks()
   189  
   190  	// Create the TaskRunners
   191  	if err := ar.initTaskRunners(tg.Tasks); err != nil {
   192  		return nil, err
   193  	}
   194  
   195  	return ar, nil
   196  }
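         
         // Typical lifecycle, shown here only as an illustrative sketch: conf and
         // updatedAlloc are placeholders, and the real wiring lives in the Nomad
         // client, which assembles the *Config and feeds in server updates.
         //
         //	ar, err := NewAllocRunner(conf)
         //	if err != nil {
         //		return err
         //	}
         //	// ar.Restore() would be called here only when rebuilding an alloc
         //	// runner from local state after an agent restart.
         //	go ar.Run()
         //	ar.Update(updatedAlloc) // push a newer server copy of the alloc
         //	ar.Destroy()            // stop tasks and clean up local state
         //	<-ar.DestroyCh()        // wait until destruction has completed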
   197  
   198  // initTaskRunners creates task runners but does *not* run them.
   199  func (ar *allocRunner) initTaskRunners(tasks []*structs.Task) error {
   200  	for _, task := range tasks {
   201  		config := &taskrunner.Config{
   202  			Alloc:               ar.alloc,
   203  			ClientConfig:        ar.clientConfig,
   204  			Task:                task,
   205  			TaskDir:             ar.allocDir.NewTaskDir(task.Name),
   206  			Logger:              ar.logger,
   207  			StateDB:             ar.stateDB,
   208  			StateUpdater:        ar,
   209  			Consul:              ar.consulClient,
   210  			Vault:               ar.vaultClient,
   211  			DeviceStatsReporter: ar.deviceStatsReporter,
   212  			DeviceManager:       ar.devicemanager,
   213  			DriverManager:       ar.driverManager,
   214  			ServersContactedCh:  ar.serversContactedCh,
   215  		}
   216  
   217  		// Create, but do not Run, the task runner
   218  		tr, err := taskrunner.NewTaskRunner(config)
   219  		if err != nil {
   220  			return fmt.Errorf("failed creating runner for task %q: %v", task.Name, err)
   221  		}
   222  
   223  		ar.tasks[task.Name] = tr
   224  	}
   225  	return nil
   226  }
   227  
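         // WaitCh returns a channel that is closed when the Run loop has exited.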
   228  func (ar *allocRunner) WaitCh() <-chan struct{} {
   229  	return ar.waitCh
   230  }
   231  
   232  // Run the AllocRunner. Starts tasks if the alloc is non-terminal and closes
   233  // WaitCh when it exits. Should be started in a goroutine.
   234  func (ar *allocRunner) Run() {
   235  	// Close the wait channel on return
   236  	defer close(ar.waitCh)
   237  
   238  	// Start the task state update handler
   239  	go ar.handleTaskStateUpdates()
   240  
   241  	// Start the alloc update handler
   242  	go ar.handleAllocUpdates()
   243  
    244  	// If the task update channel has been closed, that means we've been shut down.
   245  	select {
   246  	case <-ar.taskStateUpdateHandlerCh:
   247  		return
   248  	default:
   249  	}
   250  
    251  	// Run the prerun hooks if the alloc is non-terminal
   252  	if ar.shouldRun() {
   253  		if err := ar.prerun(); err != nil {
   254  			ar.logger.Error("prerun failed", "error", err)
   255  			goto POST
   256  		}
   257  	}
   258  
   259  	// Run the runners (blocks until they exit)
   260  	ar.runTasks()
   261  
   262  POST:
   263  	// Run the postrun hooks
   264  	if err := ar.postrun(); err != nil {
   265  		ar.logger.Error("postrun failed", "error", err)
   266  	}
   267  
   268  }
   269  
    270  // shouldRun returns true if the alloc is in a state in which the alloc runner
    271  // should run it.
   272  func (ar *allocRunner) shouldRun() bool {
   273  	// Do not run allocs that are terminal
   274  	if ar.Alloc().TerminalStatus() {
   275  		ar.logger.Trace("alloc terminal; not running",
   276  			"desired_status", ar.Alloc().DesiredStatus,
   277  			"client_status", ar.Alloc().ClientStatus,
   278  		)
   279  		return false
   280  	}
   281  
   282  	// It's possible that the alloc local state was marked terminal before
   283  	// the server copy of the alloc (checked above) was marked as terminal,
   284  	// so check the local state as well.
   285  	switch clientStatus := ar.AllocState().ClientStatus; clientStatus {
   286  	case structs.AllocClientStatusComplete, structs.AllocClientStatusFailed, structs.AllocClientStatusLost:
   287  		ar.logger.Trace("alloc terminal; updating server and not running", "status", clientStatus)
   288  		return false
   289  	}
   290  
   291  	return true
   292  }
   293  
   294  // runTasks is used to run the task runners and block until they exit.
   295  func (ar *allocRunner) runTasks() {
   296  	for _, task := range ar.tasks {
   297  		go task.Run()
   298  	}
   299  
   300  	for _, task := range ar.tasks {
   301  		<-task.WaitCh()
   302  	}
   303  }
   304  
   305  // Alloc returns the current allocation being run by this runner as sent by the
   306  // server. This view of the allocation does not have updated task states.
   307  func (ar *allocRunner) Alloc() *structs.Allocation {
   308  	ar.allocLock.RLock()
   309  	defer ar.allocLock.RUnlock()
   310  	return ar.alloc
   311  }
   312  
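         // setAlloc swaps in an updated allocation under the alloc lock.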
   313  func (ar *allocRunner) setAlloc(updated *structs.Allocation) {
   314  	ar.allocLock.Lock()
   315  	ar.alloc = updated
   316  	ar.allocLock.Unlock()
   317  }
   318  
   319  // GetAllocDir returns the alloc dir which is safe for concurrent use.
   320  func (ar *allocRunner) GetAllocDir() *allocdir.AllocDir {
   321  	return ar.allocDir
   322  }
   323  
   324  // Restore state from database. Must be called after NewAllocRunner but before
   325  // Run.
   326  func (ar *allocRunner) Restore() error {
    327  	// Retrieve deployment status to avoid resetting it across agent
   328  	// restarts. Once a deployment status is set Nomad no longer monitors
   329  	// alloc health, so we must persist deployment state across restarts.
   330  	ds, err := ar.stateDB.GetDeploymentStatus(ar.id)
   331  	if err != nil {
   332  		return err
   333  	}
   334  
   335  	ar.stateLock.Lock()
   336  	ar.state.DeploymentStatus = ds
   337  	ar.stateLock.Unlock()
   338  
   339  	// Restore task runners
   340  	for _, tr := range ar.tasks {
   341  		if err := tr.Restore(); err != nil {
   342  			return err
   343  		}
   344  	}
   345  
   346  	return nil
   347  }
   348  
   349  // persistDeploymentStatus stores AllocDeploymentStatus.
   350  func (ar *allocRunner) persistDeploymentStatus(ds *structs.AllocDeploymentStatus) {
   351  	if err := ar.stateDB.PutDeploymentStatus(ar.id, ds); err != nil {
   352  		// While any persistence errors are very bad, the worst case
   353  		// scenario for failing to persist deployment status is that if
   354  		// the agent is restarted it will monitor the deployment status
   355  		// again. This could cause a deployment's status to change when
   356  		// that shouldn't happen. However, allowing that seems better
   357  		// than failing the entire allocation.
   358  		ar.logger.Error("error storing deployment status", "error", err)
   359  	}
   360  }
   361  
   362  // TaskStateUpdated is called by TaskRunner when a task's state has been
    363  // updated. It does not process the update synchronously but instead notifies a
    364  // goroutine that the state has changed. Since processing the state change may
    365  // cause the task to be killed (thus changing its state again), it cannot be
    366  // done synchronously as it would cause a deadlock due to reentrancy.
   367  //
   368  // The goroutine is used to compute changes to the alloc's ClientStatus and to
   369  // update the server with the new state.
   370  func (ar *allocRunner) TaskStateUpdated() {
   371  	select {
   372  	case ar.taskStateUpdatedCh <- struct{}{}:
   373  	default:
   374  		// already pending updates
   375  	}
   376  }
   377  
    378  // handleTaskStateUpdates must be run in a goroutine as it monitors
   379  // taskStateUpdatedCh for task state update notifications and processes task
   380  // states.
   381  //
   382  // Processing task state updates must be done in a goroutine as it may have to
   383  // kill tasks which causes further task state updates.
   384  func (ar *allocRunner) handleTaskStateUpdates() {
   385  	defer close(ar.taskStateUpdateHandlerCh)
   386  
   387  	for done := false; !done; {
   388  		select {
   389  		case <-ar.taskStateUpdatedCh:
   390  		case <-ar.waitCh:
   391  			// Run has exited, sync once more to ensure final
   392  			// states are collected.
   393  			done = true
   394  		}
   395  
   396  		ar.logger.Trace("handling task state update", "done", done)
   397  
   398  		// Set with the appropriate event if task runners should be
   399  		// killed.
   400  		var killEvent *structs.TaskEvent
   401  
    402  		// If task runners should be killed, this is set to the name of
    403  		// the task at fault.
   404  		killTask := ""
   405  
   406  		// True if task runners should be killed because a leader
   407  		// failed (informational).
   408  		leaderFailed := false
   409  
   410  		// Task state has been updated; gather the state of the other tasks
   411  		trNum := len(ar.tasks)
   412  		liveRunners := make([]*taskrunner.TaskRunner, 0, trNum)
   413  		states := make(map[string]*structs.TaskState, trNum)
   414  
   415  		for name, tr := range ar.tasks {
   416  			state := tr.TaskState()
   417  			states[name] = state
   418  
   419  			// Capture live task runners in case we need to kill them
   420  			if state.State != structs.TaskStateDead {
   421  				liveRunners = append(liveRunners, tr)
   422  				continue
   423  			}
   424  
   425  			// Task is dead, determine if other tasks should be killed
   426  			if state.Failed {
   427  				// Only set failed event if no event has been
   428  				// set yet to give dead leaders priority.
   429  				if killEvent == nil {
   430  					killTask = name
   431  					killEvent = structs.NewTaskEvent(structs.TaskSiblingFailed).
   432  						SetFailedSibling(name)
   433  				}
   434  			} else if tr.IsLeader() {
   435  				killEvent = structs.NewTaskEvent(structs.TaskLeaderDead)
   436  				leaderFailed = true
   437  				killTask = name
   438  			}
   439  		}
   440  
   441  		// If there's a kill event set and live runners, kill them
   442  		if killEvent != nil && len(liveRunners) > 0 {
   443  
   444  			// Log kill reason
   445  			if leaderFailed {
   446  				ar.logger.Debug("leader task dead, destroying all tasks", "leader_task", killTask)
   447  			} else {
   448  				ar.logger.Debug("task failure, destroying all tasks", "failed_task", killTask)
   449  			}
   450  
   451  			// Emit kill event for live runners
   452  			for _, tr := range liveRunners {
   453  				tr.EmitEvent(killEvent)
   454  			}
   455  
   456  			// Kill 'em all
   457  			states = ar.killTasks()
   458  
   459  			// Wait for TaskRunners to exit before continuing to
   460  			// prevent looping before TaskRunners have transitioned
   461  			// to Dead.
   462  			for _, tr := range liveRunners {
   463  				select {
   464  				case <-tr.WaitCh():
   465  				case <-ar.waitCh:
   466  				}
   467  			}
   468  		}
   469  
   470  		// Get the client allocation
   471  		calloc := ar.clientAlloc(states)
   472  
   473  		// Update the server
   474  		ar.stateUpdater.AllocStateUpdated(calloc)
   475  
   476  		// Broadcast client alloc to listeners
   477  		ar.allocBroadcaster.Send(calloc)
   478  	}
   479  }
   480  
    481  // killTasks kills all task runners, killing the leader (if there is one)
    482  // first. Errors other than taskrunner.ErrTaskNotRunning are logged. The task
    483  // states observed after Kill has been called are returned.
   484  func (ar *allocRunner) killTasks() map[string]*structs.TaskState {
   485  	var mu sync.Mutex
   486  	states := make(map[string]*structs.TaskState, len(ar.tasks))
   487  
   488  	// Kill leader first, synchronously
   489  	for name, tr := range ar.tasks {
   490  		if !tr.IsLeader() {
   491  			continue
   492  		}
   493  
   494  		err := tr.Kill(context.TODO(), structs.NewTaskEvent(structs.TaskKilling))
   495  		if err != nil && err != taskrunner.ErrTaskNotRunning {
   496  			ar.logger.Warn("error stopping leader task", "error", err, "task_name", name)
   497  		}
   498  
   499  		state := tr.TaskState()
   500  		states[name] = state
   501  		break
   502  	}
   503  
   504  	// Kill the rest concurrently
   505  	wg := sync.WaitGroup{}
   506  	for name, tr := range ar.tasks {
   507  		if tr.IsLeader() {
   508  			continue
   509  		}
   510  
   511  		wg.Add(1)
   512  		go func(name string, tr *taskrunner.TaskRunner) {
   513  			defer wg.Done()
   514  			err := tr.Kill(context.TODO(), structs.NewTaskEvent(structs.TaskKilling))
   515  			if err != nil && err != taskrunner.ErrTaskNotRunning {
   516  				ar.logger.Warn("error stopping task", "error", err, "task_name", name)
   517  			}
   518  
   519  			state := tr.TaskState()
   520  			mu.Lock()
   521  			states[name] = state
   522  			mu.Unlock()
   523  		}(name, tr)
   524  	}
   525  	wg.Wait()
   526  
   527  	return states
   528  }
   529  
   530  // clientAlloc takes in the task states and returns an Allocation populated
    531  // with client-specific fields
   532  func (ar *allocRunner) clientAlloc(taskStates map[string]*structs.TaskState) *structs.Allocation {
   533  	ar.stateLock.Lock()
   534  	defer ar.stateLock.Unlock()
   535  
   536  	// store task states for AllocState to expose
   537  	ar.state.TaskStates = taskStates
   538  
   539  	a := &structs.Allocation{
   540  		ID:         ar.id,
   541  		TaskStates: taskStates,
   542  	}
   543  
   544  	if d := ar.state.DeploymentStatus; d != nil {
   545  		a.DeploymentStatus = d.Copy()
   546  	}
   547  
   548  	// Compute the ClientStatus
   549  	if ar.state.ClientStatus != "" {
   550  		// The client status is being forced
   551  		a.ClientStatus, a.ClientDescription = ar.state.ClientStatus, ar.state.ClientDescription
   552  	} else {
   553  		a.ClientStatus, a.ClientDescription = getClientStatus(taskStates)
   554  	}
   555  
   556  	// If the allocation is terminal, make sure all required fields are properly
   557  	// set.
   558  	if a.ClientTerminalStatus() {
   559  		alloc := ar.Alloc()
   560  
   561  		// If we are part of a deployment and the alloc has failed, mark the
    562  		// alloc as unhealthy. This guards against the health watcher not being
    563  		// started. If the health status is already set, it is left untouched.
   564  		if a.ClientStatus == structs.AllocClientStatusFailed &&
   565  			alloc.DeploymentID != "" && !a.DeploymentStatus.HasHealth() {
   566  			a.DeploymentStatus = &structs.AllocDeploymentStatus{
   567  				Healthy: helper.BoolToPtr(false),
   568  			}
   569  		}
   570  
    571  		// Make sure we have marked FinishedAt for every task. This is used
   572  		// to calculate the reschedule time for failed allocations.
   573  		now := time.Now()
   574  		for _, task := range alloc.Job.LookupTaskGroup(alloc.TaskGroup).Tasks {
   575  			ts, ok := a.TaskStates[task.Name]
   576  			if !ok {
   577  				ts = &structs.TaskState{}
   578  				a.TaskStates[task.Name] = ts
   579  			}
   580  			if ts.FinishedAt.IsZero() {
   581  				ts.FinishedAt = now
   582  			}
   583  		}
   584  	}
   585  
   586  	return a
   587  }
   588  
   589  // getClientStatus takes in the task states for a given allocation and computes
   590  // the client status and description
   591  func getClientStatus(taskStates map[string]*structs.TaskState) (status, description string) {
   592  	var pending, running, dead, failed bool
   593  	for _, state := range taskStates {
   594  		switch state.State {
   595  		case structs.TaskStateRunning:
   596  			running = true
   597  		case structs.TaskStatePending:
   598  			pending = true
   599  		case structs.TaskStateDead:
   600  			if state.Failed {
   601  				failed = true
   602  			} else {
   603  				dead = true
   604  			}
   605  		}
   606  	}
   607  
   608  	// Determine the alloc status
   609  	if failed {
   610  		return structs.AllocClientStatusFailed, "Failed tasks"
   611  	} else if running {
   612  		return structs.AllocClientStatusRunning, "Tasks are running"
   613  	} else if pending {
   614  		return structs.AllocClientStatusPending, "No tasks have started"
   615  	} else if dead {
   616  		return structs.AllocClientStatusComplete, "All tasks have completed"
   617  	}
   618  
   619  	return "", ""
   620  }
   621  
   622  // SetClientStatus is a helper for forcing a specific client
   623  // status on the alloc runner. This is used during restore errors
   624  // when the task state can't be restored.
   625  func (ar *allocRunner) SetClientStatus(clientStatus string) {
   626  	ar.stateLock.Lock()
   627  	defer ar.stateLock.Unlock()
   628  	ar.state.ClientStatus = clientStatus
   629  }
   630  
   631  // AllocState returns a copy of allocation state including a snapshot of task
   632  // states.
   633  func (ar *allocRunner) AllocState() *state.State {
   634  	ar.stateLock.RLock()
   635  	state := ar.state.Copy()
   636  	ar.stateLock.RUnlock()
   637  
   638  	// If TaskStateUpdated has not been called yet, ar.state.TaskStates
   639  	// won't be set as it is not the canonical source of TaskStates.
   640  	if len(state.TaskStates) == 0 {
    641  		state.TaskStates = make(map[string]*structs.TaskState, len(ar.tasks))
   642  		for k, tr := range ar.tasks {
   643  			state.TaskStates[k] = tr.TaskState()
   644  		}
   645  	}
   646  
   647  	// Generate alloc to get other state fields
   648  	alloc := ar.clientAlloc(state.TaskStates)
   649  	state.ClientStatus = alloc.ClientStatus
   650  	state.ClientDescription = alloc.ClientDescription
   651  	state.DeploymentStatus = alloc.DeploymentStatus
   652  
   653  	return state
   654  }
   655  
    656  // Update asynchronously updates the running allocation with a new version
   657  // received from the server.
   658  // When processing a new update, we will first attempt to drain stale updates
   659  // from the queue, before appending the new one.
   660  func (ar *allocRunner) Update(update *structs.Allocation) {
   661  	select {
   662  	// Drain queued update from the channel if possible, and check the modify
   663  	// index
   664  	case oldUpdate := <-ar.allocUpdatedCh:
   665  		// If the old update is newer than the replacement, then skip the new one
   666  		// and return. This case shouldn't happen, but may in the case of a bug
   667  		// elsewhere inside the system.
   668  		if oldUpdate.AllocModifyIndex > update.AllocModifyIndex {
   669  			ar.logger.Debug("Discarding allocation update due to newer alloc revision in queue",
   670  				"old_modify_index", oldUpdate.AllocModifyIndex,
   671  				"new_modify_index", update.AllocModifyIndex)
   672  			ar.allocUpdatedCh <- oldUpdate
   673  			return
   674  		} else {
   675  			ar.logger.Debug("Discarding allocation update",
   676  				"skipped_modify_index", oldUpdate.AllocModifyIndex,
   677  				"new_modify_index", update.AllocModifyIndex)
   678  		}
   679  	case <-ar.waitCh:
   680  		ar.logger.Trace("AllocRunner has terminated, skipping alloc update",
   681  			"modify_index", update.AllocModifyIndex)
   682  		return
   683  	default:
   684  	}
   685  
   686  	// Queue the new update
   687  	ar.allocUpdatedCh <- update
   688  }
   689  
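         // handleAllocUpdates runs in its own goroutine and applies queued allocation
         // updates one at a time until the Run loop exits.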
   690  func (ar *allocRunner) handleAllocUpdates() {
   691  	for {
   692  		select {
   693  		case update := <-ar.allocUpdatedCh:
   694  			ar.handleAllocUpdate(update)
   695  		case <-ar.waitCh:
   696  			return
   697  		}
   698  	}
   699  }
   700  
    701  // handleAllocUpdate applies a single allocation update: it runs the update
    702  // hooks unless the alloc is terminal, propagates the update to the task
    703  // runners, and kills the tasks if the alloc is being stopped.
   704  func (ar *allocRunner) handleAllocUpdate(update *structs.Allocation) {
   705  	// Detect Stop updates
   706  	stopping := !ar.Alloc().TerminalStatus() && update.TerminalStatus()
   707  
   708  	// Update ar.alloc
   709  	ar.setAlloc(update)
   710  
   711  	// Run update hooks if not stopping or dead
   712  	if !update.TerminalStatus() {
   713  		if err := ar.update(update); err != nil {
   714  			ar.logger.Error("error running update hooks", "error", err)
   715  		}
   716  
   717  	}
   718  
   719  	// Update task runners
   720  	for _, tr := range ar.tasks {
   721  		tr.Update(update)
   722  	}
   723  
   724  	// If alloc is being terminated, kill all tasks, leader first
   725  	if stopping {
   726  		ar.killTasks()
   727  	}
   728  
   729  }
   730  
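         // Listener returns a listener that receives the client allocations broadcast
         // by this alloc runner.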
   731  func (ar *allocRunner) Listener() *cstructs.AllocListener {
   732  	return ar.allocBroadcaster.Listen()
   733  }
   734  
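         // destroyImpl kills any running tasks, waits for the Run loop and the task
         // state update handler to exit, runs the destroy hooks, and deletes the
         // allocation's local state. It is launched at most once, from Destroy.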
   735  func (ar *allocRunner) destroyImpl() {
   736  	// Stop any running tasks and persist states in case the client is
    737  	// shut down before Destroy finishes.
   738  	states := ar.killTasks()
   739  	calloc := ar.clientAlloc(states)
   740  	ar.stateUpdater.AllocStateUpdated(calloc)
   741  
   742  	// Wait for tasks to exit and postrun hooks to finish
   743  	<-ar.waitCh
   744  
   745  	// Run destroy hooks
   746  	if err := ar.destroy(); err != nil {
   747  		ar.logger.Warn("error running destroy hooks", "error", err)
   748  	}
   749  
   750  	// Wait for task state update handler to exit before removing local
   751  	// state if Run() ran at all.
   752  	<-ar.taskStateUpdateHandlerCh
   753  
   754  	// Cleanup state db
   755  	if err := ar.stateDB.DeleteAllocationBucket(ar.id); err != nil {
   756  		ar.logger.Warn("failed to delete allocation state", "error", err)
   757  	}
   758  
   759  	// Mark alloc as destroyed
   760  	ar.destroyedLock.Lock()
   761  
   762  	if !ar.shutdown {
   763  		ar.shutdown = true
   764  		close(ar.shutdownCh)
   765  	}
   766  
   767  	ar.destroyed = true
   768  	close(ar.destroyCh)
   769  
   770  	ar.destroyedLock.Unlock()
   771  }
   772  
   773  // Destroy the alloc runner by stopping it if it is still running and cleaning
   774  // up all of its resources.
   775  //
   776  // This method is safe for calling concurrently with Run() and will cause it to
   777  // exit (thus closing WaitCh).
   778  // When the destroy action is completed, it will close DestroyCh().
   779  func (ar *allocRunner) Destroy() {
   780  	ar.destroyedLock.Lock()
   781  	defer ar.destroyedLock.Unlock()
   782  
   783  	if ar.destroyed {
   784  		// Only destroy once
   785  		return
   786  	}
   787  
   788  	if ar.destroyLaunched {
   789  		// Only dispatch a destroy once
   790  		return
   791  	}
   792  
   793  	ar.destroyLaunched = true
   794  
   795  	// Synchronize calls to shutdown/destroy
   796  	if ar.shutdownLaunched {
   797  		go func() {
   798  			ar.logger.Debug("Waiting for shutdown before destroying runner")
   799  			<-ar.shutdownCh
   800  			ar.destroyImpl()
   801  		}()
   802  
   803  		return
   804  	}
   805  
   806  	go ar.destroyImpl()
   807  }
   808  
   809  // IsDestroyed returns true if the alloc runner has been destroyed (stopped and
   810  // garbage collected).
   811  //
   812  // This method is safe for calling concurrently with Run(). Callers must
   813  // receive on WaitCh() to block until alloc runner has stopped and been
   814  // destroyed.
   815  func (ar *allocRunner) IsDestroyed() bool {
   816  	ar.destroyedLock.Lock()
   817  	defer ar.destroyedLock.Unlock()
   818  	return ar.destroyed
   819  }
   820  
   821  // IsWaiting returns true if the alloc runner is waiting for its previous
   822  // allocation to terminate.
   823  //
   824  // This method is safe for calling concurrently with Run().
   825  func (ar *allocRunner) IsWaiting() bool {
   826  	return ar.prevAllocWatcher.IsWaiting()
   827  }
   828  
   829  // DestroyCh is a channel that is closed when an allocrunner is closed due to
   830  // an explicit call to Destroy().
   831  func (ar *allocRunner) DestroyCh() <-chan struct{} {
   832  	return ar.destroyCh
   833  }
   834  
   835  // ShutdownCh is a channel that is closed when an allocrunner is closed due to
   836  // either an explicit call to Shutdown(), or Destroy().
   837  func (ar *allocRunner) ShutdownCh() <-chan struct{} {
   838  	return ar.shutdownCh
   839  }
   840  
   841  // Shutdown AllocRunner gracefully. Asynchronously shuts down all TaskRunners.
   842  // Tasks are unaffected and may be restored.
    843  // When the shutdown action is completed, it will close ShutdownCh().
   844  func (ar *allocRunner) Shutdown() {
   845  	ar.destroyedLock.Lock()
   846  	defer ar.destroyedLock.Unlock()
   847  
   848  	// Destroy is a superset of Shutdown so there's nothing to do if this
   849  	// has already been destroyed.
   850  	if ar.destroyed {
   851  		return
   852  	}
   853  
   854  	// Destroy is a superset of Shutdown so if it's been marked for destruction,
   855  	// don't try and shutdown in parallel. If shutdown has been launched, don't
   856  	// try again.
   857  	if ar.destroyLaunched || ar.shutdownLaunched {
   858  		return
   859  	}
   860  
   861  	ar.shutdownLaunched = true
   862  
   863  	go func() {
   864  		ar.logger.Trace("shutting down")
   865  
   866  		// Shutdown tasks gracefully if they were run
   867  		wg := sync.WaitGroup{}
   868  		for _, tr := range ar.tasks {
   869  			wg.Add(1)
   870  			go func(tr *taskrunner.TaskRunner) {
   871  				tr.Shutdown()
   872  				wg.Done()
   873  			}(tr)
   874  		}
   875  		wg.Wait()
   876  
   877  		// Wait for Run to exit
   878  		<-ar.waitCh
   879  
   880  		// Run shutdown hooks
   881  		ar.shutdownHooks()
   882  
   883  		// Wait for updater to finish its final run
   884  		<-ar.taskStateUpdateHandlerCh
   885  
   886  		ar.destroyedLock.Lock()
   887  		ar.shutdown = true
   888  		close(ar.shutdownCh)
   889  		ar.destroyedLock.Unlock()
   890  	}()
   891  }
   892  
   893  // IsMigrating returns true if the alloc runner is migrating data from its
   894  // previous allocation.
   895  //
   896  // This method is safe for calling concurrently with Run().
   897  func (ar *allocRunner) IsMigrating() bool {
   898  	return ar.prevAllocMigrator.IsMigrating()
   899  }
   900  
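         // StatsReporter returns the alloc runner itself since it implements the
         // AllocStatsReporter interface.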
   901  func (ar *allocRunner) StatsReporter() interfaces.AllocStatsReporter {
   902  	return ar
   903  }
   904  
   905  // LatestAllocStats returns the latest stats for an allocation. If taskFilter
   906  // is set, only stats for that task -- if it exists -- are returned.
   907  func (ar *allocRunner) LatestAllocStats(taskFilter string) (*cstructs.AllocResourceUsage, error) {
   908  	astat := &cstructs.AllocResourceUsage{
   909  		Tasks: make(map[string]*cstructs.TaskResourceUsage, len(ar.tasks)),
   910  		ResourceUsage: &cstructs.ResourceUsage{
   911  			MemoryStats: &cstructs.MemoryStats{},
   912  			CpuStats:    &cstructs.CpuStats{},
   913  			DeviceStats: []*device.DeviceGroupStats{},
   914  		},
   915  	}
   916  
   917  	for name, tr := range ar.tasks {
   918  		if taskFilter != "" && taskFilter != name {
    919  			// Getting stats for a particular task and it's not this one!
   920  			continue
   921  		}
   922  
   923  		if usage := tr.LatestResourceUsage(); usage != nil {
   924  			astat.Tasks[name] = usage
   925  			astat.ResourceUsage.Add(usage.ResourceUsage)
   926  			if usage.Timestamp > astat.Timestamp {
   927  				astat.Timestamp = usage.Timestamp
   928  			}
   929  		}
   930  	}
   931  
   932  	return astat, nil
   933  }
   934  
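         // GetTaskEventHandler returns an event handler that forwards driver events
         // for the named task as driver-message task events, or nil if the task is
         // not part of this allocation.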
   935  func (ar *allocRunner) GetTaskEventHandler(taskName string) drivermanager.EventHandler {
   936  	if tr, ok := ar.tasks[taskName]; ok {
   937  		return func(ev *drivers.TaskEvent) {
   938  			tr.EmitEvent(&structs.TaskEvent{
   939  				Type:          structs.TaskDriverMessage,
   940  				Time:          ev.Timestamp.UnixNano(),
   941  				Details:       ev.Annotations,
   942  				DriverMessage: ev.Message,
   943  			})
   944  		}
   945  	}
   946  	return nil
   947  }
   948  
    949  // RestartTask signals the task runner for the provided task to restart.
   950  func (ar *allocRunner) RestartTask(taskName string, taskEvent *structs.TaskEvent) error {
   951  	tr, ok := ar.tasks[taskName]
   952  	if !ok {
   953  		return fmt.Errorf("Could not find task runner for task: %s", taskName)
   954  	}
   955  
   956  	return tr.Restart(context.TODO(), taskEvent, false)
   957  }
   958  
    959  // RestartAll signals all task runners in the allocation to restart and passes
    960  // a copy of the task event to each task runner.
   961  // Returns any errors in a concatenated form.
   962  func (ar *allocRunner) RestartAll(taskEvent *structs.TaskEvent) error {
   963  	var err *multierror.Error
   964  
   965  	for tn := range ar.tasks {
   966  		rerr := ar.RestartTask(tn, taskEvent.Copy())
   967  		if rerr != nil {
   968  			err = multierror.Append(err, rerr)
   969  		}
   970  	}
   971  
   972  	return err.ErrorOrNil()
   973  }
   974  
   975  // Signal sends a signal request to task runners inside an allocation. If the
   976  // taskName is empty, then it is sent to all tasks.
   977  func (ar *allocRunner) Signal(taskName, signal string) error {
   978  	event := structs.NewTaskEvent(structs.TaskSignaling).SetSignalText(signal)
   979  
   980  	if taskName != "" {
   981  		tr, ok := ar.tasks[taskName]
   982  		if !ok {
   983  			return fmt.Errorf("Task not found")
   984  		}
   985  
   986  		return tr.Signal(event, signal)
   987  	}
   988  
   989  	var err *multierror.Error
   990  
   991  	for tn, tr := range ar.tasks {
   992  		rerr := tr.Signal(event.Copy(), signal)
   993  		if rerr != nil {
   994  			err = multierror.Append(err, fmt.Errorf("Failed to signal task: %s, err: %v", tn, rerr))
   995  		}
   996  	}
   997  
   998  	return err.ErrorOrNil()
   999  }
  1000  
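         // GetTaskExecHandler returns the exec handler for the named task, or nil if
         // the task is not part of this allocation.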
  1001  func (ar *allocRunner) GetTaskExecHandler(taskName string) drivermanager.TaskExecHandler {
  1002  	tr, ok := ar.tasks[taskName]
  1003  	if !ok {
  1004  		return nil
  1005  	}
  1006  
  1007  	return tr.TaskExecHandler()
  1008  }
  1009  
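         // GetTaskDriverCapabilities returns the driver capabilities for the named
         // task, or an error if the task is not part of this allocation.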
  1010  func (ar *allocRunner) GetTaskDriverCapabilities(taskName string) (*drivers.Capabilities, error) {
  1011  	tr, ok := ar.tasks[taskName]
  1012  	if !ok {
  1013  		return nil, fmt.Errorf("task not found")
  1014  	}
  1015  
  1016  	return tr.DriverCapabilities()
  1017  }