github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/client/allocrunner/tasklifecycle/coordinator.go (about)

     1  package tasklifecycle
     2  
     3  import (
     4  	"fmt"
     5  	"sync"
     6  
     7  	"github.com/hashicorp/go-hclog"
     8  	"github.com/hashicorp/nomad/nomad/structs"
     9  )
    10  
    11  // coordinatorState represents a state of the task lifecycle Coordinator FSM.
    12  type coordinatorState uint8
    13  
    14  const (
    15  	coordinatorStateInit coordinatorState = iota
    16  	coordinatorStatePrestart
    17  	coordinatorStateMain
    18  	coordinatorStatePoststart
    19  	coordinatorStateWaitAlloc
    20  	coordinatorStatePoststop
    21  )
    22  
    23  func (s coordinatorState) String() string {
    24  	switch s {
    25  	case coordinatorStateInit:
    26  		return "init"
    27  	case coordinatorStatePrestart:
    28  		return "prestart"
    29  	case coordinatorStateMain:
    30  		return "main"
    31  	case coordinatorStatePoststart:
    32  		return "poststart"
    33  	case coordinatorStateWaitAlloc:
    34  		return "wait_alloc"
    35  	case coordinatorStatePoststop:
    36  		return "poststart"
    37  	}
    38  	panic(fmt.Sprintf("Unexpected task coordinator state %d", s))
    39  }
    40  
// lifecycleStage represents a lifecycle configuration used for task
// coordination.
//
// Not all possible combinations of hook X sidecar are defined, only the ones
// that are relevant for coordinating task initialization order. For example, a
// main task with sidecar set to `true` starts at the same time as a
// non-sidecar main task, so there is no need to treat them differently.
type lifecycleStage uint8

// The values below are used as map keys to group tasks and their gates by
// lifecycle configuration.
const (
	// lifecycleStagePrestartEphemeral are tasks with the "prestart" hook and
	// sidecar set to "false".
	lifecycleStagePrestartEphemeral lifecycleStage = iota

	// lifecycleStagePrestartSidecar are tasks with the "prestart" hook and
	// sidecar set to "true".
	lifecycleStagePrestartSidecar

	// lifecycleStageMain are tasks without a lifecycle or a lifecycle with an
	// empty hook value.
	lifecycleStageMain

	// lifecycleStagePoststartEphemeral are tasks with the "poststart" hook and
	// sidecar set to "false".
	lifecycleStagePoststartEphemeral

	// lifecycleStagePoststartSidecar are tasks with the "poststart" hook and
	// sidecar set to "true".
	lifecycleStagePoststartSidecar

	// lifecycleStagePoststop are tasks with the "poststop" hook.
	lifecycleStagePoststop
)
    74  
// Coordinator controls when tasks with a given lifecycle configuration are
// allowed to start and run.
//
// It behaves like a finite state machine where each state transition blocks or
// allows some task lifecycle types to run.
type Coordinator struct {
	// logger receives a trace message on every state transition.
	logger hclog.Logger

	// tasksByLifecycle is an index used to group and quickly access tasks by
	// their lifecycle stage. Values are task names.
	tasksByLifecycle map[lifecycleStage][]string

	// currentState is the current state of the FSM. It must only be accessed
	// while holding the lock.
	currentState     coordinatorState
	currentStateLock sync.RWMutex

	// gates store the gates that control each task lifecycle stage. A gate is
	// only created for stages that have at least one task, so lookups for
	// other stages return nil.
	gates map[lifecycleStage]*Gate
}
    95  
    96  // NewCoordinator returns a new Coordinator with all tasks initially blocked.
    97  func NewCoordinator(logger hclog.Logger, tasks []*structs.Task, shutdownCh <-chan struct{}) *Coordinator {
    98  	c := &Coordinator{
    99  		logger:           logger.Named("task_coordinator"),
   100  		tasksByLifecycle: indexTasksByLifecycle(tasks),
   101  		gates:            make(map[lifecycleStage]*Gate),
   102  	}
   103  
   104  	for lifecycle := range c.tasksByLifecycle {
   105  		c.gates[lifecycle] = NewGate(shutdownCh)
   106  	}
   107  
   108  	c.enterStateLocked(coordinatorStateInit)
   109  	return c
   110  }
   111  
   112  // Restart sets the Coordinator state back to "init" and is used to coordinate
   113  // a full alloc restart. Since all tasks will run again they need to be pending
   114  // before they are allowed to proceed.
   115  func (c *Coordinator) Restart() {
   116  	c.currentStateLock.Lock()
   117  	defer c.currentStateLock.Unlock()
   118  	c.enterStateLocked(coordinatorStateInit)
   119  }
   120  
   121  // Restore is used to set the Coordinator FSM to the correct state when an
   122  // alloc is restored. Must be called before the allocrunner is running.
   123  func (c *Coordinator) Restore(states map[string]*structs.TaskState) {
   124  	// Skip the "init" state when restoring since the tasks were likely already
   125  	// running, causing the Coordinator to be stuck waiting for them to be
   126  	// "pending".
   127  	c.enterStateLocked(coordinatorStatePrestart)
   128  	c.TaskStateUpdated(states)
   129  }
   130  
   131  // StartConditionForTask returns a channel that is unblocked when the task is
   132  // allowed to run.
   133  func (c *Coordinator) StartConditionForTask(task *structs.Task) <-chan struct{} {
   134  	lifecycle := taskLifecycleStage(task)
   135  	return c.gates[lifecycle].WaitCh()
   136  }
   137  
   138  // TaskStateUpdated notifies that a task state has changed. This may cause the
   139  // Coordinator to transition to another state.
   140  func (c *Coordinator) TaskStateUpdated(states map[string]*structs.TaskState) {
   141  	c.currentStateLock.Lock()
   142  	defer c.currentStateLock.Unlock()
   143  
   144  	// We may be able to move directly through some states (for example, when
   145  	// an alloc doesn't have any prestart task we can skip the prestart state),
   146  	// so loop until we stabilize.
   147  	// This is also important when restoring an alloc since we need to find the
   148  	// state where FSM was last positioned.
   149  	for {
   150  		nextState := c.nextStateLocked(states)
   151  		if nextState == c.currentState {
   152  			return
   153  		}
   154  
   155  		c.enterStateLocked(nextState)
   156  	}
   157  }
   158  
   159  // nextStateLocked returns the state the FSM should transition to given its
   160  // current internal state and the received states of the tasks.
   161  // The currentStateLock must be held before calling this method.
   162  func (c *Coordinator) nextStateLocked(states map[string]*structs.TaskState) coordinatorState {
   163  
   164  	// coordinatorStatePoststop is the terminal state of the FSM, and can be
   165  	// reached at any time.
   166  	if c.isAllocDone(states) {
   167  		return coordinatorStatePoststop
   168  	}
   169  
   170  	switch c.currentState {
   171  	case coordinatorStateInit:
   172  		if !c.isInitDone(states) {
   173  			return coordinatorStateInit
   174  		}
   175  		return coordinatorStatePrestart
   176  
   177  	case coordinatorStatePrestart:
   178  		if !c.isPrestartDone(states) {
   179  			return coordinatorStatePrestart
   180  		}
   181  		return coordinatorStateMain
   182  
   183  	case coordinatorStateMain:
   184  		if !c.isMainDone(states) {
   185  			return coordinatorStateMain
   186  		}
   187  		return coordinatorStatePoststart
   188  
   189  	case coordinatorStatePoststart:
   190  		if !c.isPoststartDone(states) {
   191  			return coordinatorStatePoststart
   192  		}
   193  		return coordinatorStateWaitAlloc
   194  
   195  	case coordinatorStateWaitAlloc:
   196  		if !c.isAllocDone(states) {
   197  			return coordinatorStateWaitAlloc
   198  		}
   199  		return coordinatorStatePoststop
   200  
   201  	case coordinatorStatePoststop:
   202  		return coordinatorStatePoststop
   203  	}
   204  
   205  	// If the code reaches here it's a programming error, since the switch
   206  	// statement should cover all possible states and return the next state.
   207  	panic(fmt.Sprintf("unexpected state %s", c.currentState))
   208  }
   209  
   210  // enterStateLocked updates the current state of the Coordinator FSM and
   211  // executes any action necessary for the state transition.
   212  // The currentStateLock must be held before calling this method.
   213  func (c *Coordinator) enterStateLocked(state coordinatorState) {
   214  	c.logger.Trace("state transition", "from", c.currentState, "to", state)
   215  
   216  	switch state {
   217  	case coordinatorStateInit:
   218  		c.block(lifecycleStagePrestartEphemeral)
   219  		c.block(lifecycleStagePrestartSidecar)
   220  		c.block(lifecycleStageMain)
   221  		c.block(lifecycleStagePoststartEphemeral)
   222  		c.block(lifecycleStagePoststartSidecar)
   223  		c.block(lifecycleStagePoststop)
   224  
   225  	case coordinatorStatePrestart:
   226  		c.block(lifecycleStageMain)
   227  		c.block(lifecycleStagePoststartEphemeral)
   228  		c.block(lifecycleStagePoststartSidecar)
   229  		c.block(lifecycleStagePoststop)
   230  
   231  		c.allow(lifecycleStagePrestartEphemeral)
   232  		c.allow(lifecycleStagePrestartSidecar)
   233  
   234  	case coordinatorStateMain:
   235  		c.block(lifecycleStagePrestartEphemeral)
   236  		c.block(lifecycleStagePoststartEphemeral)
   237  		c.block(lifecycleStagePoststartSidecar)
   238  		c.block(lifecycleStagePoststop)
   239  
   240  		c.allow(lifecycleStagePrestartSidecar)
   241  		c.allow(lifecycleStageMain)
   242  
   243  	case coordinatorStatePoststart:
   244  		c.block(lifecycleStagePrestartEphemeral)
   245  		c.block(lifecycleStagePoststop)
   246  
   247  		c.allow(lifecycleStagePrestartSidecar)
   248  		c.allow(lifecycleStageMain)
   249  		c.allow(lifecycleStagePoststartEphemeral)
   250  		c.allow(lifecycleStagePoststartSidecar)
   251  
   252  	case coordinatorStateWaitAlloc:
   253  		c.block(lifecycleStagePrestartEphemeral)
   254  		c.block(lifecycleStagePoststartEphemeral)
   255  		c.block(lifecycleStagePoststop)
   256  
   257  		c.allow(lifecycleStagePrestartSidecar)
   258  		c.allow(lifecycleStageMain)
   259  		c.allow(lifecycleStagePoststartSidecar)
   260  
   261  	case coordinatorStatePoststop:
   262  		c.block(lifecycleStagePrestartEphemeral)
   263  		c.block(lifecycleStagePrestartSidecar)
   264  		c.block(lifecycleStageMain)
   265  		c.block(lifecycleStagePoststartEphemeral)
   266  		c.block(lifecycleStagePoststartSidecar)
   267  
   268  		c.allow(lifecycleStagePoststop)
   269  	}
   270  
   271  	c.currentState = state
   272  }
   273  
   274  // isInitDone returns true when the following conditions are met:
   275  //   - all tasks are in the "pending" state.
   276  func (c *Coordinator) isInitDone(states map[string]*structs.TaskState) bool {
   277  	for _, task := range states {
   278  		if task.State != structs.TaskStatePending {
   279  			return false
   280  		}
   281  	}
   282  	return true
   283  }
   284  
   285  // isPrestartDone returns true when the following conditions are met:
   286  //   - there is at least one prestart task
   287  //   - all ephemeral prestart tasks are successful.
   288  //   - no ephemeral prestart task has failed.
   289  //   - all prestart sidecar tasks are running.
   290  func (c *Coordinator) isPrestartDone(states map[string]*structs.TaskState) bool {
   291  	if !c.hasPrestart() {
   292  		return true
   293  	}
   294  
   295  	for _, task := range c.tasksByLifecycle[lifecycleStagePrestartEphemeral] {
   296  		if !states[task].Successful() {
   297  			return false
   298  		}
   299  	}
   300  	for _, task := range c.tasksByLifecycle[lifecycleStagePrestartSidecar] {
   301  		if states[task].State != structs.TaskStateRunning {
   302  			return false
   303  		}
   304  	}
   305  	return true
   306  }
   307  
   308  // isMainDone returns true when the following conditions are met:
   309  //   - there is at least one main task.
   310  //   - all main tasks are no longer "pending".
   311  func (c *Coordinator) isMainDone(states map[string]*structs.TaskState) bool {
   312  	if !c.hasMain() {
   313  		return true
   314  	}
   315  
   316  	for _, task := range c.tasksByLifecycle[lifecycleStageMain] {
   317  		if states[task].State == structs.TaskStatePending {
   318  			return false
   319  		}
   320  	}
   321  	return true
   322  }
   323  
   324  // isPoststartDone returns true when the following conditions are met:
   325  //   - there is at least one poststart task.
   326  //   - all ephemeral poststart tasks are in the "dead" state.
   327  func (c *Coordinator) isPoststartDone(states map[string]*structs.TaskState) bool {
   328  	if !c.hasPoststart() {
   329  		return true
   330  	}
   331  
   332  	for _, task := range c.tasksByLifecycle[lifecycleStagePoststartEphemeral] {
   333  		if states[task].State != structs.TaskStateDead {
   334  			return false
   335  		}
   336  	}
   337  	return true
   338  }
   339  
   340  // isAllocDone returns true when the following conditions are met:
   341  //   - all non-poststop tasks are in the "dead" state.
   342  func (c *Coordinator) isAllocDone(states map[string]*structs.TaskState) bool {
   343  	for lifecycle, tasks := range c.tasksByLifecycle {
   344  		if lifecycle == lifecycleStagePoststop {
   345  			continue
   346  		}
   347  
   348  		for _, task := range tasks {
   349  			if states[task].State != structs.TaskStateDead {
   350  				return false
   351  			}
   352  		}
   353  	}
   354  	return true
   355  }
   356  
   357  func (c *Coordinator) hasPrestart() bool {
   358  	return len(c.tasksByLifecycle[lifecycleStagePrestartEphemeral])+
   359  		len(c.tasksByLifecycle[lifecycleStagePrestartSidecar]) > 0
   360  }
   361  
   362  func (c *Coordinator) hasMain() bool {
   363  	return len(c.tasksByLifecycle[lifecycleStageMain]) > 0
   364  }
   365  
   366  func (c *Coordinator) hasPoststart() bool {
   367  	return len(c.tasksByLifecycle[lifecycleStagePoststartEphemeral])+
   368  		len(c.tasksByLifecycle[lifecycleStagePoststartSidecar]) > 0
   369  }
   370  
   371  func (c *Coordinator) hasPoststop() bool {
   372  	return len(c.tasksByLifecycle[lifecycleStagePoststop]) > 0
   373  }
   374  
   375  // block is used to block the execution of tasks in the given lifecycle stage.
   376  func (c *Coordinator) block(lifecycle lifecycleStage) {
   377  	gate := c.gates[lifecycle]
   378  	if gate != nil {
   379  		gate.Close()
   380  	}
   381  }
   382  
   383  // allows is used to allow the execution of tasks in the given lifecycle stage.
   384  func (c *Coordinator) allow(lifecycle lifecycleStage) {
   385  	gate := c.gates[lifecycle]
   386  	if gate != nil {
   387  		gate.Open()
   388  	}
   389  }
   390  
   391  // indexTasksByLifecycle generates a map that groups tasks by their lifecycle
   392  // configuration. This makes it easier to retrieve tasks by these groups or to
   393  // determine if a task has a certain lifecycle configuration.
   394  func indexTasksByLifecycle(tasks []*structs.Task) map[lifecycleStage][]string {
   395  	index := make(map[lifecycleStage][]string)
   396  
   397  	for _, task := range tasks {
   398  		lifecycle := taskLifecycleStage(task)
   399  
   400  		if _, ok := index[lifecycle]; !ok {
   401  			index[lifecycle] = []string{}
   402  		}
   403  		index[lifecycle] = append(index[lifecycle], task.Name)
   404  	}
   405  
   406  	return index
   407  }
   408  
   409  // taskLifecycleStage returns the relevant lifecycle stage for a given task.
   410  func taskLifecycleStage(task *structs.Task) lifecycleStage {
   411  	if task.IsPrestart() {
   412  		if task.Lifecycle.Sidecar {
   413  			return lifecycleStagePrestartSidecar
   414  		}
   415  		return lifecycleStagePrestartEphemeral
   416  	} else if task.IsPoststart() {
   417  		if task.Lifecycle.Sidecar {
   418  			return lifecycleStagePoststartSidecar
   419  		}
   420  		return lifecycleStagePoststartEphemeral
   421  	} else if task.IsPoststop() {
   422  		return lifecycleStagePoststop
   423  	}
   424  
   425  	// Assume task is "main" by default.
   426  	return lifecycleStageMain
   427  }