github.com/Ilhicas/nomad@v1.0.4-0.20210304152020-e86851182bc3/client/allocrunner/taskrunner/restarts/restarts.go (about) 1 package restarts 2 3 import ( 4 "fmt" 5 "math/rand" 6 "sync" 7 "time" 8 9 "github.com/hashicorp/nomad/nomad/structs" 10 "github.com/hashicorp/nomad/plugins/drivers" 11 ) 12 13 const ( 14 // jitter is the percent of jitter added to restart delays. 15 jitter = 0.25 16 17 ReasonNoRestartsAllowed = "Policy allows no restarts" 18 ReasonUnrecoverableErrror = "Error was unrecoverable" 19 ReasonWithinPolicy = "Restart within policy" 20 ReasonDelay = "Exceeded allowed attempts, applying a delay" 21 ) 22 23 func NewRestartTracker(policy *structs.RestartPolicy, jobType string, tlc *structs.TaskLifecycleConfig) *RestartTracker { 24 // Batch jobs should not restart if they exit successfully 25 onSuccess := jobType != structs.JobTypeBatch 26 27 // Prestart sidecars should get restarted on success 28 if tlc != nil && tlc.Hook == structs.TaskLifecycleHookPrestart { 29 onSuccess = tlc.Sidecar 30 } 31 32 // Poststart sidecars should get restarted on success 33 if tlc != nil && tlc.Hook == structs.TaskLifecycleHookPoststart { 34 onSuccess = tlc.Sidecar 35 } 36 37 // Poststop should never be restarted on success 38 if tlc != nil && tlc.Hook == structs.TaskLifecycleHookPoststop { 39 onSuccess = false 40 } 41 42 return &RestartTracker{ 43 startTime: time.Now(), 44 onSuccess: onSuccess, 45 policy: policy, 46 rand: rand.New(rand.NewSource(time.Now().Unix())), 47 } 48 } 49 50 type RestartTracker struct { 51 exitRes *drivers.ExitResult 52 startErr error 53 killed bool // Whether the task has been killed 54 restartTriggered bool // Whether the task has been signalled to be restarted 55 failure bool // Whether a failure triggered the restart 56 count int // Current number of attempts. 57 onSuccess bool // Whether to restart on successful exit code. 58 startTime time.Time // When the interval began 59 reason string // The reason for the last state 60 policy *structs.RestartPolicy 61 rand *rand.Rand 62 lock sync.Mutex 63 } 64 65 // SetPolicy updates the policy used to determine restarts. 66 func (r *RestartTracker) SetPolicy(policy *structs.RestartPolicy) { 67 r.lock.Lock() 68 defer r.lock.Unlock() 69 r.policy = policy 70 } 71 72 // GetPolicy returns a copy of the policy used to determine restarts. 73 func (r *RestartTracker) GetPolicy() *structs.RestartPolicy { 74 r.lock.Lock() 75 defer r.lock.Unlock() 76 return r.policy.Copy() 77 } 78 79 // SetStartError is used to mark the most recent start error. If starting was 80 // successful the error should be nil. 81 func (r *RestartTracker) SetStartError(err error) *RestartTracker { 82 r.lock.Lock() 83 defer r.lock.Unlock() 84 r.startErr = err 85 r.failure = true 86 return r 87 } 88 89 // SetExitResult is used to mark the most recent wait result. 90 func (r *RestartTracker) SetExitResult(res *drivers.ExitResult) *RestartTracker { 91 r.lock.Lock() 92 defer r.lock.Unlock() 93 r.exitRes = res 94 r.failure = true 95 return r 96 } 97 98 // SetRestartTriggered is used to mark that the task has been signalled to be 99 // restarted. Setting the failure to true restarts according to the restart 100 // policy. When failure is false the task is restarted without considering the 101 // restart policy. 102 func (r *RestartTracker) SetRestartTriggered(failure bool) *RestartTracker { 103 r.lock.Lock() 104 defer r.lock.Unlock() 105 if failure { 106 r.failure = true 107 } else { 108 r.restartTriggered = true 109 } 110 return r 111 } 112 113 // SetKilled is used to mark that the task has been killed. 114 func (r *RestartTracker) SetKilled() *RestartTracker { 115 r.lock.Lock() 116 defer r.lock.Unlock() 117 r.killed = true 118 return r 119 } 120 121 // GetReason returns a human-readable description for the last state returned by 122 // GetState. 123 func (r *RestartTracker) GetReason() string { 124 r.lock.Lock() 125 defer r.lock.Unlock() 126 return r.reason 127 } 128 129 // GetCount returns the current restart count 130 func (r *RestartTracker) GetCount() int { 131 r.lock.Lock() 132 defer r.lock.Unlock() 133 return r.count 134 } 135 136 // GetState returns the tasks next state given the set exit code and start 137 // error. One of the following states are returned: 138 // * TaskRestarting - Task should be restarted 139 // * TaskNotRestarting - Task should not be restarted and has exceeded its 140 // restart policy. 141 // * TaskTerminated - Task has terminated successfully and does not need a 142 // restart. 143 // 144 // If TaskRestarting is returned, the duration is how long to wait until 145 // starting the task again. 146 func (r *RestartTracker) GetState() (string, time.Duration) { 147 r.lock.Lock() 148 defer r.lock.Unlock() 149 150 // Clear out the existing state 151 defer func() { 152 r.startErr = nil 153 r.exitRes = nil 154 r.restartTriggered = false 155 r.failure = false 156 r.killed = false 157 }() 158 159 // Hot path if task was killed 160 if r.killed { 161 r.reason = "" 162 return structs.TaskKilled, 0 163 } 164 165 // Hot path if a restart was triggered 166 if r.restartTriggered { 167 r.reason = "" 168 return structs.TaskRestarting, 0 169 } 170 171 // Hot path if no attempts are expected 172 if r.policy.Attempts == 0 { 173 r.reason = ReasonNoRestartsAllowed 174 175 // If the task does not restart on a successful exit code and 176 // the exit code was successful: terminate. 177 if !r.onSuccess && r.exitRes != nil && r.exitRes.Successful() { 178 return structs.TaskTerminated, 0 179 } 180 181 // Task restarts even on a successful exit code but no restarts 182 // allowed. 183 return structs.TaskNotRestarting, 0 184 } 185 186 // Check if we have entered a new interval. 187 end := r.startTime.Add(r.policy.Interval) 188 now := time.Now() 189 if now.After(end) { 190 r.count = 0 191 r.startTime = now 192 } 193 194 r.count++ 195 196 // Handle restarts due to failures 197 if !r.failure { 198 return "", 0 199 } 200 201 if r.startErr != nil { 202 // If the error is not recoverable, do not restart. 203 if !structs.IsRecoverable(r.startErr) { 204 r.reason = ReasonUnrecoverableErrror 205 return structs.TaskNotRestarting, 0 206 } 207 } else if r.exitRes != nil { 208 // If the task started successfully and restart on success isn't specified, 209 // don't restart but don't mark as failed. 210 if r.exitRes.Successful() && !r.onSuccess { 211 r.reason = "Restart unnecessary as task terminated successfully" 212 return structs.TaskTerminated, 0 213 } 214 } 215 216 // If this task has been restarted due to failures more times 217 // than the restart policy allows within an interval fail 218 // according to the restart policy's mode. 219 if r.count > r.policy.Attempts { 220 if r.policy.Mode == structs.RestartPolicyModeFail { 221 r.reason = fmt.Sprintf( 222 `Exceeded allowed attempts %d in interval %v and mode is "fail"`, 223 r.policy.Attempts, r.policy.Interval) 224 return structs.TaskNotRestarting, 0 225 } else { 226 r.reason = ReasonDelay 227 return structs.TaskRestarting, r.getDelay() 228 } 229 } 230 231 r.reason = ReasonWithinPolicy 232 return structs.TaskRestarting, r.jitter() 233 } 234 235 // getDelay returns the delay time to enter the next interval. 236 func (r *RestartTracker) getDelay() time.Duration { 237 end := r.startTime.Add(r.policy.Interval) 238 now := time.Now() 239 return end.Sub(now) 240 } 241 242 // jitter returns the delay time plus a jitter. 243 func (r *RestartTracker) jitter() time.Duration { 244 // Get the delay and ensure it is valid. 245 d := r.policy.Delay.Nanoseconds() 246 if d == 0 { 247 d = 1 248 } 249 250 j := float64(r.rand.Int63n(d)) * jitter 251 return time.Duration(d + int64(j)) 252 }