github.com/bigcommerce/nomad@v0.9.3-bc/client/allocrunner/taskrunner/restarts/restarts.go (about) 1 package restarts 2 3 import ( 4 "fmt" 5 "math/rand" 6 "sync" 7 "time" 8 9 "github.com/hashicorp/nomad/nomad/structs" 10 "github.com/hashicorp/nomad/plugins/drivers" 11 ) 12 13 const ( 14 // jitter is the percent of jitter added to restart delays. 15 jitter = 0.25 16 17 ReasonNoRestartsAllowed = "Policy allows no restarts" 18 ReasonUnrecoverableErrror = "Error was unrecoverable" 19 ReasonWithinPolicy = "Restart within policy" 20 ReasonDelay = "Exceeded allowed attempts, applying a delay" 21 ) 22 23 func NewRestartTracker(policy *structs.RestartPolicy, jobType string) *RestartTracker { 24 onSuccess := true 25 if jobType == structs.JobTypeBatch { 26 onSuccess = false 27 } 28 return &RestartTracker{ 29 startTime: time.Now(), 30 onSuccess: onSuccess, 31 policy: policy, 32 rand: rand.New(rand.NewSource(time.Now().Unix())), 33 } 34 } 35 36 type RestartTracker struct { 37 exitRes *drivers.ExitResult 38 startErr error 39 killed bool // Whether the task has been killed 40 restartTriggered bool // Whether the task has been signalled to be restarted 41 failure bool // Whether a failure triggered the restart 42 count int // Current number of attempts. 43 onSuccess bool // Whether to restart on successful exit code. 44 startTime time.Time // When the interval began 45 reason string // The reason for the last state 46 policy *structs.RestartPolicy 47 rand *rand.Rand 48 lock sync.Mutex 49 } 50 51 // SetPolicy updates the policy used to determine restarts. 52 func (r *RestartTracker) SetPolicy(policy *structs.RestartPolicy) { 53 r.lock.Lock() 54 defer r.lock.Unlock() 55 r.policy = policy 56 } 57 58 // GetPolicy returns a copy of the policy used to determine restarts. 59 func (r *RestartTracker) GetPolicy() *structs.RestartPolicy { 60 r.lock.Lock() 61 defer r.lock.Unlock() 62 return r.policy.Copy() 63 } 64 65 // SetStartError is used to mark the most recent start error. If starting was 66 // successful the error should be nil. 67 func (r *RestartTracker) SetStartError(err error) *RestartTracker { 68 r.lock.Lock() 69 defer r.lock.Unlock() 70 r.startErr = err 71 r.failure = true 72 return r 73 } 74 75 // SetExitResult is used to mark the most recent wait result. 76 func (r *RestartTracker) SetExitResult(res *drivers.ExitResult) *RestartTracker { 77 r.lock.Lock() 78 defer r.lock.Unlock() 79 r.exitRes = res 80 r.failure = true 81 return r 82 } 83 84 // SetRestartTriggered is used to mark that the task has been signalled to be 85 // restarted. Setting the failure to true restarts according to the restart 86 // policy. When failure is false the task is restarted without considering the 87 // restart policy. 88 func (r *RestartTracker) SetRestartTriggered(failure bool) *RestartTracker { 89 r.lock.Lock() 90 defer r.lock.Unlock() 91 if failure { 92 r.failure = true 93 } else { 94 r.restartTriggered = true 95 } 96 return r 97 } 98 99 // SetKilled is used to mark that the task has been killed. 100 func (r *RestartTracker) SetKilled() *RestartTracker { 101 r.lock.Lock() 102 defer r.lock.Unlock() 103 r.killed = true 104 return r 105 } 106 107 // GetReason returns a human-readable description for the last state returned by 108 // GetState. 109 func (r *RestartTracker) GetReason() string { 110 r.lock.Lock() 111 defer r.lock.Unlock() 112 return r.reason 113 } 114 115 // GetCount returns the current restart count 116 func (r *RestartTracker) GetCount() int { 117 r.lock.Lock() 118 defer r.lock.Unlock() 119 return r.count 120 } 121 122 // GetState returns the tasks next state given the set exit code and start 123 // error. One of the following states are returned: 124 // * TaskRestarting - Task should be restarted 125 // * TaskNotRestarting - Task should not be restarted and has exceeded its 126 // restart policy. 127 // * TaskTerminated - Task has terminated successfully and does not need a 128 // restart. 129 // 130 // If TaskRestarting is returned, the duration is how long to wait until 131 // starting the task again. 132 func (r *RestartTracker) GetState() (string, time.Duration) { 133 r.lock.Lock() 134 defer r.lock.Unlock() 135 136 // Clear out the existing state 137 defer func() { 138 r.startErr = nil 139 r.exitRes = nil 140 r.restartTriggered = false 141 r.failure = false 142 r.killed = false 143 }() 144 145 // Hot path if task was killed 146 if r.killed { 147 r.reason = "" 148 return structs.TaskKilled, 0 149 } 150 151 // Hot path if a restart was triggered 152 if r.restartTriggered { 153 r.reason = "" 154 return structs.TaskRestarting, 0 155 } 156 157 // Hot path if no attempts are expected 158 if r.policy.Attempts == 0 { 159 r.reason = ReasonNoRestartsAllowed 160 161 // If the task does not restart on a successful exit code and 162 // the exit code was successful: terminate. 163 if !r.onSuccess && r.exitRes != nil && r.exitRes.Successful() { 164 return structs.TaskTerminated, 0 165 } 166 167 // Task restarts even on a successful exit code but no restarts 168 // allowed. 169 return structs.TaskNotRestarting, 0 170 } 171 172 // Check if we have entered a new interval. 173 end := r.startTime.Add(r.policy.Interval) 174 now := time.Now() 175 if now.After(end) { 176 r.count = 0 177 r.startTime = now 178 } 179 180 r.count++ 181 182 // Handle restarts due to failures 183 if !r.failure { 184 return "", 0 185 } 186 187 if r.startErr != nil { 188 // If the error is not recoverable, do not restart. 189 if !structs.IsRecoverable(r.startErr) { 190 r.reason = ReasonUnrecoverableErrror 191 return structs.TaskNotRestarting, 0 192 } 193 } else if r.exitRes != nil { 194 // If the task started successfully and restart on success isn't specified, 195 // don't restart but don't mark as failed. 196 if r.exitRes.Successful() && !r.onSuccess { 197 r.reason = "Restart unnecessary as task terminated successfully" 198 return structs.TaskTerminated, 0 199 } 200 } 201 202 // If this task has been restarted due to failures more times 203 // than the restart policy allows within an interval fail 204 // according to the restart policy's mode. 205 if r.count > r.policy.Attempts { 206 if r.policy.Mode == structs.RestartPolicyModeFail { 207 r.reason = fmt.Sprintf( 208 `Exceeded allowed attempts %d in interval %v and mode is "fail"`, 209 r.policy.Attempts, r.policy.Interval) 210 return structs.TaskNotRestarting, 0 211 } else { 212 r.reason = ReasonDelay 213 return structs.TaskRestarting, r.getDelay() 214 } 215 } 216 217 r.reason = ReasonWithinPolicy 218 return structs.TaskRestarting, r.jitter() 219 } 220 221 // getDelay returns the delay time to enter the next interval. 222 func (r *RestartTracker) getDelay() time.Duration { 223 end := r.startTime.Add(r.policy.Interval) 224 now := time.Now() 225 return end.Sub(now) 226 } 227 228 // jitter returns the delay time plus a jitter. 229 func (r *RestartTracker) jitter() time.Duration { 230 // Get the delay and ensure it is valid. 231 d := r.policy.Delay.Nanoseconds() 232 if d == 0 { 233 d = 1 234 } 235 236 j := float64(r.rand.Int63n(d)) * jitter 237 return time.Duration(d + int64(j)) 238 }