github.com/zoomfoo/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/client/allocrunner/taskrunner/restarts/restarts.go (about) 1 package restarts 2 3 import ( 4 "fmt" 5 "math/rand" 6 "sync" 7 "time" 8 9 dstructs "github.com/hashicorp/nomad/client/driver/structs" 10 "github.com/hashicorp/nomad/nomad/structs" 11 ) 12 13 const ( 14 // jitter is the percent of jitter added to restart delays. 15 jitter = 0.25 16 17 ReasonNoRestartsAllowed = "Policy allows no restarts" 18 ReasonUnrecoverableErrror = "Error was unrecoverable" 19 ReasonWithinPolicy = "Restart within policy" 20 ReasonDelay = "Exceeded allowed attempts, applying a delay" 21 ) 22 23 func NewRestartTracker(policy *structs.RestartPolicy, jobType string) *RestartTracker { 24 onSuccess := true 25 if jobType == structs.JobTypeBatch { 26 onSuccess = false 27 } 28 return &RestartTracker{ 29 startTime: time.Now(), 30 onSuccess: onSuccess, 31 policy: policy, 32 rand: rand.New(rand.NewSource(time.Now().Unix())), 33 } 34 } 35 36 type RestartTracker struct { 37 waitRes *dstructs.WaitResult 38 startErr error 39 restartTriggered bool // Whether the task has been signalled to be restarted 40 failure bool // Whether a failure triggered the restart 41 count int // Current number of attempts. 42 onSuccess bool // Whether to restart on successful exit code. 43 startTime time.Time // When the interval began 44 reason string // The reason for the last state 45 policy *structs.RestartPolicy 46 rand *rand.Rand 47 lock sync.Mutex 48 } 49 50 // SetPolicy updates the policy used to determine restarts. 51 func (r *RestartTracker) SetPolicy(policy *structs.RestartPolicy) { 52 r.lock.Lock() 53 defer r.lock.Unlock() 54 r.policy = policy 55 } 56 57 // GetPolicy returns a copy of the policy used to determine restarts. 58 func (r *RestartTracker) GetPolicy() *structs.RestartPolicy { 59 r.lock.Lock() 60 defer r.lock.Unlock() 61 return r.policy.Copy() 62 } 63 64 // SetStartError is used to mark the most recent start error. If starting was 65 // successful the error should be nil. 66 func (r *RestartTracker) SetStartError(err error) *RestartTracker { 67 r.lock.Lock() 68 defer r.lock.Unlock() 69 r.startErr = err 70 r.failure = true 71 return r 72 } 73 74 // SetWaitResult is used to mark the most recent wait result. 75 func (r *RestartTracker) SetWaitResult(res *dstructs.WaitResult) *RestartTracker { 76 r.lock.Lock() 77 defer r.lock.Unlock() 78 r.waitRes = res 79 r.failure = true 80 return r 81 } 82 83 // SetRestartTriggered is used to mark that the task has been signalled to be 84 // restarted. Setting the failure to true restarts according to the restart 85 // policy. When failure is false the task is restarted without considering the 86 // restart policy. 87 func (r *RestartTracker) SetRestartTriggered(failure bool) *RestartTracker { 88 r.lock.Lock() 89 defer r.lock.Unlock() 90 if failure { 91 r.failure = true 92 } else { 93 r.restartTriggered = true 94 } 95 return r 96 } 97 98 // GetReason returns a human-readable description for the last state returned by 99 // GetState. 100 func (r *RestartTracker) GetReason() string { 101 r.lock.Lock() 102 defer r.lock.Unlock() 103 return r.reason 104 } 105 106 // GetState returns the tasks next state given the set exit code and start 107 // error. One of the following states are returned: 108 // * TaskRestarting - Task should be restarted 109 // * TaskNotRestarting - Task should not be restarted and has exceeded its 110 // restart policy. 111 // * TaskTerminated - Task has terminated successfully and does not need a 112 // restart. 113 // 114 // If TaskRestarting is returned, the duration is how long to wait until 115 // starting the task again. 116 func (r *RestartTracker) GetState() (string, time.Duration) { 117 r.lock.Lock() 118 defer r.lock.Unlock() 119 120 // Clear out the existing state 121 defer func() { 122 r.startErr = nil 123 r.waitRes = nil 124 r.restartTriggered = false 125 r.failure = false 126 }() 127 128 // Hot path if a restart was triggered 129 if r.restartTriggered { 130 r.reason = "" 131 return structs.TaskRestarting, 0 132 } 133 134 // Hot path if no attempts are expected 135 if r.policy.Attempts == 0 { 136 r.reason = ReasonNoRestartsAllowed 137 138 // If the task does not restart on a successful exit code and 139 // the exit code was successful: terminate. 140 if !r.onSuccess && r.waitRes != nil && r.waitRes.Successful() { 141 return structs.TaskTerminated, 0 142 } 143 144 // Task restarts even on a successful exit code but no restarts 145 // allowed. 146 return structs.TaskNotRestarting, 0 147 } 148 149 r.count++ 150 151 // Check if we have entered a new interval. 152 end := r.startTime.Add(r.policy.Interval) 153 now := time.Now() 154 if now.After(end) { 155 r.count = 0 156 r.startTime = now 157 } 158 159 // Handle restarts due to failures 160 if !r.failure { 161 return "", 0 162 } 163 164 if r.startErr != nil { 165 // If the error is not recoverable, do not restart. 166 if !structs.IsRecoverable(r.startErr) { 167 r.reason = ReasonUnrecoverableErrror 168 return structs.TaskNotRestarting, 0 169 } 170 } else if r.waitRes != nil { 171 // If the task started successfully and restart on success isn't specified, 172 // don't restart but don't mark as failed. 173 if r.waitRes.Successful() && !r.onSuccess { 174 r.reason = "Restart unnecessary as task terminated successfully" 175 return structs.TaskTerminated, 0 176 } 177 } 178 179 // If this task has been restarted due to failures more times 180 // than the restart policy allows within an interval fail 181 // according to the restart policy's mode. 182 if r.count > r.policy.Attempts { 183 if r.policy.Mode == structs.RestartPolicyModeFail { 184 r.reason = fmt.Sprintf( 185 `Exceeded allowed attempts %d in interval %v and mode is "fail"`, 186 r.policy.Attempts, r.policy.Interval) 187 return structs.TaskNotRestarting, 0 188 } else { 189 r.reason = ReasonDelay 190 return structs.TaskRestarting, r.getDelay() 191 } 192 } 193 194 r.reason = ReasonWithinPolicy 195 return structs.TaskRestarting, r.jitter() 196 } 197 198 // getDelay returns the delay time to enter the next interval. 199 func (r *RestartTracker) getDelay() time.Duration { 200 end := r.startTime.Add(r.policy.Interval) 201 now := time.Now() 202 return end.Sub(now) 203 } 204 205 // jitter returns the delay time plus a jitter. 206 func (r *RestartTracker) jitter() time.Duration { 207 // Get the delay and ensure it is valid. 208 d := r.policy.Delay.Nanoseconds() 209 if d == 0 { 210 d = 1 211 } 212 213 j := float64(r.rand.Int63n(d)) * jitter 214 return time.Duration(d + int64(j)) 215 }