github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/client/allocrunner/tasklifecycle/coordinator.go (about) 1 package tasklifecycle 2 3 import ( 4 "fmt" 5 "sync" 6 7 "github.com/hashicorp/go-hclog" 8 "github.com/hashicorp/nomad/nomad/structs" 9 ) 10 11 // coordinatorState represents a state of the task lifecycle Coordinator FSM. 12 type coordinatorState uint8 13 14 const ( 15 coordinatorStateInit coordinatorState = iota 16 coordinatorStatePrestart 17 coordinatorStateMain 18 coordinatorStatePoststart 19 coordinatorStateWaitAlloc 20 coordinatorStatePoststop 21 ) 22 23 func (s coordinatorState) String() string { 24 switch s { 25 case coordinatorStateInit: 26 return "init" 27 case coordinatorStatePrestart: 28 return "prestart" 29 case coordinatorStateMain: 30 return "main" 31 case coordinatorStatePoststart: 32 return "poststart" 33 case coordinatorStateWaitAlloc: 34 return "wait_alloc" 35 case coordinatorStatePoststop: 36 return "poststart" 37 } 38 panic(fmt.Sprintf("Unexpected task coordinator state %d", s)) 39 } 40 41 // lifecycleStage represents a lifecycle configuration used for task 42 // coordination. 43 // 44 // Not all possible combinations of hook X sidecar are defined, only the ones 45 // that are relevant for coordinating task initialization order. For example, a 46 // main task with sidecar set to `true` starts at the same time as a 47 // non-sidecar main task, so there is no need to treat them differently. 48 type lifecycleStage uint8 49 50 const ( 51 // lifecycleStagePrestartEphemeral are tasks with the "prestart" hook and 52 // sidecar set to "false". 53 lifecycleStagePrestartEphemeral lifecycleStage = iota 54 55 // lifecycleStagePrestartSidecar are tasks with the "prestart" hook and 56 // sidecar set to "true". 57 lifecycleStagePrestartSidecar 58 59 // lifecycleStageMain are tasks without a lifecycle or a lifecycle with an 60 // empty hook value. 61 lifecycleStageMain 62 63 // lifecycleStagePoststartEphemeral are tasks with the "poststart" hook and 64 // sidecar set to "false" 65 lifecycleStagePoststartEphemeral 66 67 // lifecycleStagePoststartSidecar are tasks with the "poststart" hook and 68 // sidecar set to "true". 69 lifecycleStagePoststartSidecar 70 71 // lifecycleStagePoststop are tasks with the "poststop" hook. 72 lifecycleStagePoststop 73 ) 74 75 // Coordinator controls when tasks with a given lifecycle configuration are 76 // allowed to start and run. 77 // 78 // It behaves like a finite state machine where each state transition blocks or 79 // allows some task lifecycle types to run. 80 type Coordinator struct { 81 logger hclog.Logger 82 83 // tasksByLifecycle is an index used to group and quickly access tasks by 84 // their lifecycle stage. 85 tasksByLifecycle map[lifecycleStage][]string 86 87 // currentState is the current state of the FSM. It must only be accessed 88 // while holding the lock. 89 currentState coordinatorState 90 currentStateLock sync.RWMutex 91 92 // gates store the gates that control each task lifecycle stage. 93 gates map[lifecycleStage]*Gate 94 } 95 96 // NewCoordinator returns a new Coordinator with all tasks initially blocked. 97 func NewCoordinator(logger hclog.Logger, tasks []*structs.Task, shutdownCh <-chan struct{}) *Coordinator { 98 c := &Coordinator{ 99 logger: logger.Named("task_coordinator"), 100 tasksByLifecycle: indexTasksByLifecycle(tasks), 101 gates: make(map[lifecycleStage]*Gate), 102 } 103 104 for lifecycle := range c.tasksByLifecycle { 105 c.gates[lifecycle] = NewGate(shutdownCh) 106 } 107 108 c.enterStateLocked(coordinatorStateInit) 109 return c 110 } 111 112 // Restart sets the Coordinator state back to "init" and is used to coordinate 113 // a full alloc restart. Since all tasks will run again they need to be pending 114 // before they are allowed to proceed. 115 func (c *Coordinator) Restart() { 116 c.currentStateLock.Lock() 117 defer c.currentStateLock.Unlock() 118 c.enterStateLocked(coordinatorStateInit) 119 } 120 121 // Restore is used to set the Coordinator FSM to the correct state when an 122 // alloc is restored. Must be called before the allocrunner is running. 123 func (c *Coordinator) Restore(states map[string]*structs.TaskState) { 124 // Skip the "init" state when restoring since the tasks were likely already 125 // running, causing the Coordinator to be stuck waiting for them to be 126 // "pending". 127 c.enterStateLocked(coordinatorStatePrestart) 128 c.TaskStateUpdated(states) 129 } 130 131 // StartConditionForTask returns a channel that is unblocked when the task is 132 // allowed to run. 133 func (c *Coordinator) StartConditionForTask(task *structs.Task) <-chan struct{} { 134 lifecycle := taskLifecycleStage(task) 135 return c.gates[lifecycle].WaitCh() 136 } 137 138 // TaskStateUpdated notifies that a task state has changed. This may cause the 139 // Coordinator to transition to another state. 140 func (c *Coordinator) TaskStateUpdated(states map[string]*structs.TaskState) { 141 c.currentStateLock.Lock() 142 defer c.currentStateLock.Unlock() 143 144 // We may be able to move directly through some states (for example, when 145 // an alloc doesn't have any prestart task we can skip the prestart state), 146 // so loop until we stabilize. 147 // This is also important when restoring an alloc since we need to find the 148 // state where FSM was last positioned. 149 for { 150 nextState := c.nextStateLocked(states) 151 if nextState == c.currentState { 152 return 153 } 154 155 c.enterStateLocked(nextState) 156 } 157 } 158 159 // nextStateLocked returns the state the FSM should transition to given its 160 // current internal state and the received states of the tasks. 161 // The currentStateLock must be held before calling this method. 162 func (c *Coordinator) nextStateLocked(states map[string]*structs.TaskState) coordinatorState { 163 164 // coordinatorStatePoststop is the terminal state of the FSM, and can be 165 // reached at any time. 166 if c.isAllocDone(states) { 167 return coordinatorStatePoststop 168 } 169 170 switch c.currentState { 171 case coordinatorStateInit: 172 if !c.isInitDone(states) { 173 return coordinatorStateInit 174 } 175 return coordinatorStatePrestart 176 177 case coordinatorStatePrestart: 178 if !c.isPrestartDone(states) { 179 return coordinatorStatePrestart 180 } 181 return coordinatorStateMain 182 183 case coordinatorStateMain: 184 if !c.isMainDone(states) { 185 return coordinatorStateMain 186 } 187 return coordinatorStatePoststart 188 189 case coordinatorStatePoststart: 190 if !c.isPoststartDone(states) { 191 return coordinatorStatePoststart 192 } 193 return coordinatorStateWaitAlloc 194 195 case coordinatorStateWaitAlloc: 196 if !c.isAllocDone(states) { 197 return coordinatorStateWaitAlloc 198 } 199 return coordinatorStatePoststop 200 201 case coordinatorStatePoststop: 202 return coordinatorStatePoststop 203 } 204 205 // If the code reaches here it's a programming error, since the switch 206 // statement should cover all possible states and return the next state. 207 panic(fmt.Sprintf("unexpected state %s", c.currentState)) 208 } 209 210 // enterStateLocked updates the current state of the Coordinator FSM and 211 // executes any action necessary for the state transition. 212 // The currentStateLock must be held before calling this method. 213 func (c *Coordinator) enterStateLocked(state coordinatorState) { 214 c.logger.Trace("state transition", "from", c.currentState, "to", state) 215 216 switch state { 217 case coordinatorStateInit: 218 c.block(lifecycleStagePrestartEphemeral) 219 c.block(lifecycleStagePrestartSidecar) 220 c.block(lifecycleStageMain) 221 c.block(lifecycleStagePoststartEphemeral) 222 c.block(lifecycleStagePoststartSidecar) 223 c.block(lifecycleStagePoststop) 224 225 case coordinatorStatePrestart: 226 c.block(lifecycleStageMain) 227 c.block(lifecycleStagePoststartEphemeral) 228 c.block(lifecycleStagePoststartSidecar) 229 c.block(lifecycleStagePoststop) 230 231 c.allow(lifecycleStagePrestartEphemeral) 232 c.allow(lifecycleStagePrestartSidecar) 233 234 case coordinatorStateMain: 235 c.block(lifecycleStagePrestartEphemeral) 236 c.block(lifecycleStagePoststartEphemeral) 237 c.block(lifecycleStagePoststartSidecar) 238 c.block(lifecycleStagePoststop) 239 240 c.allow(lifecycleStagePrestartSidecar) 241 c.allow(lifecycleStageMain) 242 243 case coordinatorStatePoststart: 244 c.block(lifecycleStagePrestartEphemeral) 245 c.block(lifecycleStagePoststop) 246 247 c.allow(lifecycleStagePrestartSidecar) 248 c.allow(lifecycleStageMain) 249 c.allow(lifecycleStagePoststartEphemeral) 250 c.allow(lifecycleStagePoststartSidecar) 251 252 case coordinatorStateWaitAlloc: 253 c.block(lifecycleStagePrestartEphemeral) 254 c.block(lifecycleStagePoststartEphemeral) 255 c.block(lifecycleStagePoststop) 256 257 c.allow(lifecycleStagePrestartSidecar) 258 c.allow(lifecycleStageMain) 259 c.allow(lifecycleStagePoststartSidecar) 260 261 case coordinatorStatePoststop: 262 c.block(lifecycleStagePrestartEphemeral) 263 c.block(lifecycleStagePrestartSidecar) 264 c.block(lifecycleStageMain) 265 c.block(lifecycleStagePoststartEphemeral) 266 c.block(lifecycleStagePoststartSidecar) 267 268 c.allow(lifecycleStagePoststop) 269 } 270 271 c.currentState = state 272 } 273 274 // isInitDone returns true when the following conditions are met: 275 // - all tasks are in the "pending" state. 276 func (c *Coordinator) isInitDone(states map[string]*structs.TaskState) bool { 277 for _, task := range states { 278 if task.State != structs.TaskStatePending { 279 return false 280 } 281 } 282 return true 283 } 284 285 // isPrestartDone returns true when the following conditions are met: 286 // - there is at least one prestart task 287 // - all ephemeral prestart tasks are successful. 288 // - no ephemeral prestart task has failed. 289 // - all prestart sidecar tasks are running. 290 func (c *Coordinator) isPrestartDone(states map[string]*structs.TaskState) bool { 291 if !c.hasPrestart() { 292 return true 293 } 294 295 for _, task := range c.tasksByLifecycle[lifecycleStagePrestartEphemeral] { 296 if !states[task].Successful() { 297 return false 298 } 299 } 300 for _, task := range c.tasksByLifecycle[lifecycleStagePrestartSidecar] { 301 if states[task].State != structs.TaskStateRunning { 302 return false 303 } 304 } 305 return true 306 } 307 308 // isMainDone returns true when the following conditions are met: 309 // - there is at least one main task. 310 // - all main tasks are no longer "pending". 311 func (c *Coordinator) isMainDone(states map[string]*structs.TaskState) bool { 312 if !c.hasMain() { 313 return true 314 } 315 316 for _, task := range c.tasksByLifecycle[lifecycleStageMain] { 317 if states[task].State == structs.TaskStatePending { 318 return false 319 } 320 } 321 return true 322 } 323 324 // isPoststartDone returns true when the following conditions are met: 325 // - there is at least one poststart task. 326 // - all ephemeral poststart tasks are in the "dead" state. 327 func (c *Coordinator) isPoststartDone(states map[string]*structs.TaskState) bool { 328 if !c.hasPoststart() { 329 return true 330 } 331 332 for _, task := range c.tasksByLifecycle[lifecycleStagePoststartEphemeral] { 333 if states[task].State != structs.TaskStateDead { 334 return false 335 } 336 } 337 return true 338 } 339 340 // isAllocDone returns true when the following conditions are met: 341 // - all non-poststop tasks are in the "dead" state. 342 func (c *Coordinator) isAllocDone(states map[string]*structs.TaskState) bool { 343 for lifecycle, tasks := range c.tasksByLifecycle { 344 if lifecycle == lifecycleStagePoststop { 345 continue 346 } 347 348 for _, task := range tasks { 349 if states[task].State != structs.TaskStateDead { 350 return false 351 } 352 } 353 } 354 return true 355 } 356 357 func (c *Coordinator) hasPrestart() bool { 358 return len(c.tasksByLifecycle[lifecycleStagePrestartEphemeral])+ 359 len(c.tasksByLifecycle[lifecycleStagePrestartSidecar]) > 0 360 } 361 362 func (c *Coordinator) hasMain() bool { 363 return len(c.tasksByLifecycle[lifecycleStageMain]) > 0 364 } 365 366 func (c *Coordinator) hasPoststart() bool { 367 return len(c.tasksByLifecycle[lifecycleStagePoststartEphemeral])+ 368 len(c.tasksByLifecycle[lifecycleStagePoststartSidecar]) > 0 369 } 370 371 func (c *Coordinator) hasPoststop() bool { 372 return len(c.tasksByLifecycle[lifecycleStagePoststop]) > 0 373 } 374 375 // block is used to block the execution of tasks in the given lifecycle stage. 376 func (c *Coordinator) block(lifecycle lifecycleStage) { 377 gate := c.gates[lifecycle] 378 if gate != nil { 379 gate.Close() 380 } 381 } 382 383 // allows is used to allow the execution of tasks in the given lifecycle stage. 384 func (c *Coordinator) allow(lifecycle lifecycleStage) { 385 gate := c.gates[lifecycle] 386 if gate != nil { 387 gate.Open() 388 } 389 } 390 391 // indexTasksByLifecycle generates a map that groups tasks by their lifecycle 392 // configuration. This makes it easier to retrieve tasks by these groups or to 393 // determine if a task has a certain lifecycle configuration. 394 func indexTasksByLifecycle(tasks []*structs.Task) map[lifecycleStage][]string { 395 index := make(map[lifecycleStage][]string) 396 397 for _, task := range tasks { 398 lifecycle := taskLifecycleStage(task) 399 400 if _, ok := index[lifecycle]; !ok { 401 index[lifecycle] = []string{} 402 } 403 index[lifecycle] = append(index[lifecycle], task.Name) 404 } 405 406 return index 407 } 408 409 // taskLifecycleStage returns the relevant lifecycle stage for a given task. 410 func taskLifecycleStage(task *structs.Task) lifecycleStage { 411 if task.IsPrestart() { 412 if task.Lifecycle.Sidecar { 413 return lifecycleStagePrestartSidecar 414 } 415 return lifecycleStagePrestartEphemeral 416 } else if task.IsPoststart() { 417 if task.Lifecycle.Sidecar { 418 return lifecycleStagePoststartSidecar 419 } 420 return lifecycleStagePoststartEphemeral 421 } else if task.IsPoststop() { 422 return lifecycleStagePoststop 423 } 424 425 // Assume task is "main" by default. 426 return lifecycleStageMain 427 }