github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/client/allocrunner/taskrunner/restarts/restarts.go (about) 1 package restarts 2 3 import ( 4 "fmt" 5 "math/rand" 6 "sync" 7 "time" 8 9 "github.com/hashicorp/nomad/nomad/structs" 10 "github.com/hashicorp/nomad/plugins/drivers" 11 ) 12 13 const ( 14 // jitter is the percent of jitter added to restart delays. 15 jitter = 0.25 16 17 ReasonNoRestartsAllowed = "Policy allows no restarts" 18 ReasonUnrecoverableErrror = "Error was unrecoverable" 19 ReasonWithinPolicy = "Restart within policy" 20 ReasonDelay = "Exceeded allowed attempts, applying a delay" 21 ) 22 23 func NewRestartTracker(policy *structs.RestartPolicy, jobType string, tlc *structs.TaskLifecycleConfig) *RestartTracker { 24 onSuccess := jobType != structs.JobTypeBatch 25 if tlc != nil && tlc.Hook == structs.TaskLifecycleHookPrestart { 26 onSuccess = tlc.Sidecar 27 } 28 29 return &RestartTracker{ 30 startTime: time.Now(), 31 onSuccess: onSuccess, 32 policy: policy, 33 rand: rand.New(rand.NewSource(time.Now().Unix())), 34 } 35 } 36 37 type RestartTracker struct { 38 exitRes *drivers.ExitResult 39 startErr error 40 killed bool // Whether the task has been killed 41 restartTriggered bool // Whether the task has been signalled to be restarted 42 failure bool // Whether a failure triggered the restart 43 count int // Current number of attempts. 44 onSuccess bool // Whether to restart on successful exit code. 45 startTime time.Time // When the interval began 46 reason string // The reason for the last state 47 policy *structs.RestartPolicy 48 rand *rand.Rand 49 lock sync.Mutex 50 } 51 52 // SetPolicy updates the policy used to determine restarts. 53 func (r *RestartTracker) SetPolicy(policy *structs.RestartPolicy) { 54 r.lock.Lock() 55 defer r.lock.Unlock() 56 r.policy = policy 57 } 58 59 // GetPolicy returns a copy of the policy used to determine restarts. 60 func (r *RestartTracker) GetPolicy() *structs.RestartPolicy { 61 r.lock.Lock() 62 defer r.lock.Unlock() 63 return r.policy.Copy() 64 } 65 66 // SetStartError is used to mark the most recent start error. If starting was 67 // successful the error should be nil. 68 func (r *RestartTracker) SetStartError(err error) *RestartTracker { 69 r.lock.Lock() 70 defer r.lock.Unlock() 71 r.startErr = err 72 r.failure = true 73 return r 74 } 75 76 // SetExitResult is used to mark the most recent wait result. 77 func (r *RestartTracker) SetExitResult(res *drivers.ExitResult) *RestartTracker { 78 r.lock.Lock() 79 defer r.lock.Unlock() 80 r.exitRes = res 81 r.failure = true 82 return r 83 } 84 85 // SetRestartTriggered is used to mark that the task has been signalled to be 86 // restarted. Setting the failure to true restarts according to the restart 87 // policy. When failure is false the task is restarted without considering the 88 // restart policy. 89 func (r *RestartTracker) SetRestartTriggered(failure bool) *RestartTracker { 90 r.lock.Lock() 91 defer r.lock.Unlock() 92 if failure { 93 r.failure = true 94 } else { 95 r.restartTriggered = true 96 } 97 return r 98 } 99 100 // SetKilled is used to mark that the task has been killed. 101 func (r *RestartTracker) SetKilled() *RestartTracker { 102 r.lock.Lock() 103 defer r.lock.Unlock() 104 r.killed = true 105 return r 106 } 107 108 // GetReason returns a human-readable description for the last state returned by 109 // GetState. 110 func (r *RestartTracker) GetReason() string { 111 r.lock.Lock() 112 defer r.lock.Unlock() 113 return r.reason 114 } 115 116 // GetCount returns the current restart count 117 func (r *RestartTracker) GetCount() int { 118 r.lock.Lock() 119 defer r.lock.Unlock() 120 return r.count 121 } 122 123 // GetState returns the tasks next state given the set exit code and start 124 // error. One of the following states are returned: 125 // * TaskRestarting - Task should be restarted 126 // * TaskNotRestarting - Task should not be restarted and has exceeded its 127 // restart policy. 128 // * TaskTerminated - Task has terminated successfully and does not need a 129 // restart. 130 // 131 // If TaskRestarting is returned, the duration is how long to wait until 132 // starting the task again. 133 func (r *RestartTracker) GetState() (string, time.Duration) { 134 r.lock.Lock() 135 defer r.lock.Unlock() 136 137 // Clear out the existing state 138 defer func() { 139 r.startErr = nil 140 r.exitRes = nil 141 r.restartTriggered = false 142 r.failure = false 143 r.killed = false 144 }() 145 146 // Hot path if task was killed 147 if r.killed { 148 r.reason = "" 149 return structs.TaskKilled, 0 150 } 151 152 // Hot path if a restart was triggered 153 if r.restartTriggered { 154 r.reason = "" 155 return structs.TaskRestarting, 0 156 } 157 158 // Hot path if no attempts are expected 159 if r.policy.Attempts == 0 { 160 r.reason = ReasonNoRestartsAllowed 161 162 // If the task does not restart on a successful exit code and 163 // the exit code was successful: terminate. 164 if !r.onSuccess && r.exitRes != nil && r.exitRes.Successful() { 165 return structs.TaskTerminated, 0 166 } 167 168 // Task restarts even on a successful exit code but no restarts 169 // allowed. 170 return structs.TaskNotRestarting, 0 171 } 172 173 // Check if we have entered a new interval. 174 end := r.startTime.Add(r.policy.Interval) 175 now := time.Now() 176 if now.After(end) { 177 r.count = 0 178 r.startTime = now 179 } 180 181 r.count++ 182 183 // Handle restarts due to failures 184 if !r.failure { 185 return "", 0 186 } 187 188 if r.startErr != nil { 189 // If the error is not recoverable, do not restart. 190 if !structs.IsRecoverable(r.startErr) { 191 r.reason = ReasonUnrecoverableErrror 192 return structs.TaskNotRestarting, 0 193 } 194 } else if r.exitRes != nil { 195 // If the task started successfully and restart on success isn't specified, 196 // don't restart but don't mark as failed. 197 if r.exitRes.Successful() && !r.onSuccess { 198 r.reason = "Restart unnecessary as task terminated successfully" 199 return structs.TaskTerminated, 0 200 } 201 } 202 203 // If this task has been restarted due to failures more times 204 // than the restart policy allows within an interval fail 205 // according to the restart policy's mode. 206 if r.count > r.policy.Attempts { 207 if r.policy.Mode == structs.RestartPolicyModeFail { 208 r.reason = fmt.Sprintf( 209 `Exceeded allowed attempts %d in interval %v and mode is "fail"`, 210 r.policy.Attempts, r.policy.Interval) 211 return structs.TaskNotRestarting, 0 212 } else { 213 r.reason = ReasonDelay 214 return structs.TaskRestarting, r.getDelay() 215 } 216 } 217 218 r.reason = ReasonWithinPolicy 219 return structs.TaskRestarting, r.jitter() 220 } 221 222 // getDelay returns the delay time to enter the next interval. 223 func (r *RestartTracker) getDelay() time.Duration { 224 end := r.startTime.Add(r.policy.Interval) 225 now := time.Now() 226 return end.Sub(now) 227 } 228 229 // jitter returns the delay time plus a jitter. 230 func (r *RestartTracker) jitter() time.Duration { 231 // Get the delay and ensure it is valid. 232 d := r.policy.Delay.Nanoseconds() 233 if d == 0 { 234 d = 1 235 } 236 237 j := float64(r.rand.Int63n(d)) * jitter 238 return time.Duration(d + int64(j)) 239 }