github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/client/allocrunner/taskrunner/restarts/restarts.go (about) 1 package restarts 2 3 import ( 4 "fmt" 5 "math/rand" 6 "sync" 7 "time" 8 9 "github.com/hashicorp/nomad/nomad/structs" 10 "github.com/hashicorp/nomad/plugins/drivers" 11 ) 12 13 const ( 14 // jitter is the percent of jitter added to restart delays. 15 jitter = 0.25 16 17 ReasonNoRestartsAllowed = "Policy allows no restarts" 18 ReasonUnrecoverableError = "Error was unrecoverable" 19 ReasonWithinPolicy = "Restart within policy" 20 ReasonDelay = "Exceeded allowed attempts, applying a delay" 21 ) 22 23 func NewRestartTracker(policy *structs.RestartPolicy, jobType string, tlc *structs.TaskLifecycleConfig) *RestartTracker { 24 onSuccess := true 25 26 // Batch & SysBatch jobs should not restart if they exit successfully 27 if jobType == structs.JobTypeBatch || jobType == structs.JobTypeSysBatch { 28 onSuccess = false 29 } 30 31 // Prestart sidecars should get restarted on success 32 if tlc != nil && tlc.Hook == structs.TaskLifecycleHookPrestart { 33 onSuccess = tlc.Sidecar 34 } 35 36 // Poststart sidecars should get restarted on success 37 if tlc != nil && tlc.Hook == structs.TaskLifecycleHookPoststart { 38 onSuccess = tlc.Sidecar 39 } 40 41 // Poststop should never be restarted on success 42 if tlc != nil && tlc.Hook == structs.TaskLifecycleHookPoststop { 43 onSuccess = false 44 } 45 46 return &RestartTracker{ 47 startTime: time.Now(), 48 onSuccess: onSuccess, 49 policy: policy, 50 rand: rand.New(rand.NewSource(time.Now().Unix())), 51 } 52 } 53 54 type RestartTracker struct { 55 exitRes *drivers.ExitResult 56 startErr error 57 killed bool // Whether the task has been killed 58 restartTriggered bool // Whether the task has been signalled to be restarted 59 failure bool // Whether a failure triggered the restart 60 count int // Current number of attempts. 61 onSuccess bool // Whether to restart on successful exit code. 62 startTime time.Time // When the interval began 63 reason string // The reason for the last state 64 policy *structs.RestartPolicy 65 rand *rand.Rand 66 lock sync.Mutex 67 } 68 69 // SetPolicy updates the policy used to determine restarts. 70 func (r *RestartTracker) SetPolicy(policy *structs.RestartPolicy) { 71 r.lock.Lock() 72 defer r.lock.Unlock() 73 r.policy = policy 74 } 75 76 // GetPolicy returns a copy of the policy used to determine restarts. 77 func (r *RestartTracker) GetPolicy() *structs.RestartPolicy { 78 r.lock.Lock() 79 defer r.lock.Unlock() 80 return r.policy.Copy() 81 } 82 83 // SetStartError is used to mark the most recent start error. If starting was 84 // successful the error should be nil. 85 func (r *RestartTracker) SetStartError(err error) *RestartTracker { 86 r.lock.Lock() 87 defer r.lock.Unlock() 88 r.startErr = err 89 r.failure = true 90 return r 91 } 92 93 // SetExitResult is used to mark the most recent wait result. 94 func (r *RestartTracker) SetExitResult(res *drivers.ExitResult) *RestartTracker { 95 r.lock.Lock() 96 defer r.lock.Unlock() 97 r.exitRes = res 98 r.failure = true 99 return r 100 } 101 102 // SetRestartTriggered is used to mark that the task has been signalled to be 103 // restarted. Setting the failure to true restarts according to the restart 104 // policy. When failure is false the task is restarted without considering the 105 // restart policy. 106 func (r *RestartTracker) SetRestartTriggered(failure bool) *RestartTracker { 107 r.lock.Lock() 108 defer r.lock.Unlock() 109 if failure { 110 r.failure = true 111 } else { 112 r.restartTriggered = true 113 } 114 return r 115 } 116 117 // SetKilled is used to mark that the task has been killed. 118 func (r *RestartTracker) SetKilled() *RestartTracker { 119 r.lock.Lock() 120 defer r.lock.Unlock() 121 r.killed = true 122 return r 123 } 124 125 // GetReason returns a human-readable description for the last state returned by 126 // GetState. 127 func (r *RestartTracker) GetReason() string { 128 r.lock.Lock() 129 defer r.lock.Unlock() 130 return r.reason 131 } 132 133 // GetCount returns the current restart count 134 func (r *RestartTracker) GetCount() int { 135 r.lock.Lock() 136 defer r.lock.Unlock() 137 return r.count 138 } 139 140 // GetState returns the tasks next state given the set exit code and start 141 // error. One of the following states are returned: 142 // - TaskRestarting - Task should be restarted 143 // - TaskNotRestarting - Task should not be restarted and has exceeded its 144 // restart policy. 145 // - TaskTerminated - Task has terminated successfully and does not need a 146 // restart. 147 // 148 // If TaskRestarting is returned, the duration is how long to wait until 149 // starting the task again. 150 func (r *RestartTracker) GetState() (string, time.Duration) { 151 r.lock.Lock() 152 defer r.lock.Unlock() 153 154 // Clear out the existing state 155 defer func() { 156 r.startErr = nil 157 r.exitRes = nil 158 r.restartTriggered = false 159 r.failure = false 160 r.killed = false 161 }() 162 163 // Hot path if task was killed 164 if r.killed { 165 r.reason = "" 166 return structs.TaskKilled, 0 167 } 168 169 // Hot path if a restart was triggered 170 if r.restartTriggered { 171 r.reason = "" 172 return structs.TaskRestarting, 0 173 } 174 175 // Hot path if no attempts are expected 176 if r.policy.Attempts == 0 { 177 r.reason = ReasonNoRestartsAllowed 178 179 // If the task does not restart on a successful exit code and 180 // the exit code was successful: terminate. 181 if !r.onSuccess && r.exitRes != nil && r.exitRes.Successful() { 182 return structs.TaskTerminated, 0 183 } 184 185 // Task restarts even on a successful exit code but no restarts 186 // allowed. 187 return structs.TaskNotRestarting, 0 188 } 189 190 // Check if we have entered a new interval. 191 end := r.startTime.Add(r.policy.Interval) 192 now := time.Now() 193 if now.After(end) { 194 r.count = 0 195 r.startTime = now 196 } 197 198 r.count++ 199 200 // Handle restarts due to failures 201 if !r.failure { 202 return "", 0 203 } 204 205 if r.startErr != nil { 206 // If the error is not recoverable, do not restart. 207 if !structs.IsRecoverable(r.startErr) { 208 r.reason = ReasonUnrecoverableError 209 return structs.TaskNotRestarting, 0 210 } 211 } else if r.exitRes != nil { 212 // If the task started successfully and restart on success isn't specified, 213 // don't restart but don't mark as failed. 214 if r.exitRes.Successful() && !r.onSuccess { 215 r.reason = "Restart unnecessary as task terminated successfully" 216 return structs.TaskTerminated, 0 217 } 218 } 219 220 // If this task has been restarted due to failures more times 221 // than the restart policy allows within an interval fail 222 // according to the restart policy's mode. 223 if r.count > r.policy.Attempts { 224 if r.policy.Mode == structs.RestartPolicyModeFail { 225 r.reason = fmt.Sprintf( 226 `Exceeded allowed attempts %d in interval %v and mode is "fail"`, 227 r.policy.Attempts, r.policy.Interval) 228 return structs.TaskNotRestarting, 0 229 } else { 230 r.reason = ReasonDelay 231 return structs.TaskRestarting, r.getDelay() 232 } 233 } 234 235 r.reason = ReasonWithinPolicy 236 return structs.TaskRestarting, r.jitter() 237 } 238 239 // getDelay returns the delay time to enter the next interval. 240 func (r *RestartTracker) getDelay() time.Duration { 241 end := r.startTime.Add(r.policy.Interval) 242 now := time.Now() 243 return end.Sub(now) 244 } 245 246 // jitter returns the delay time plus a jitter. 247 func (r *RestartTracker) jitter() time.Duration { 248 // Get the delay and ensure it is valid. 249 d := r.policy.Delay.Nanoseconds() 250 if d == 0 { 251 d = 1 252 } 253 254 j := float64(r.rand.Int63n(d)) * jitter 255 return time.Duration(d + int64(j)) 256 }