github.com/diptanu/nomad@v0.5.7-0.20170516172507-d72e86cbe3d9/client/restarts.go (about) 1 package client 2 3 import ( 4 "fmt" 5 "math/rand" 6 "sync" 7 "time" 8 9 dstructs "github.com/hashicorp/nomad/client/driver/structs" 10 "github.com/hashicorp/nomad/nomad/structs" 11 ) 12 13 const ( 14 // jitter is the percent of jitter added to restart delays. 15 jitter = 0.25 16 17 ReasonNoRestartsAllowed = "Policy allows no restarts" 18 ReasonUnrecoverableErrror = "Error was unrecoverable" 19 ReasonWithinPolicy = "Restart within policy" 20 ReasonDelay = "Exceeded allowed attempts, applying a delay" 21 ) 22 23 func newRestartTracker(policy *structs.RestartPolicy, jobType string) *RestartTracker { 24 onSuccess := true 25 if jobType == structs.JobTypeBatch { 26 onSuccess = false 27 } 28 return &RestartTracker{ 29 startTime: time.Now(), 30 onSuccess: onSuccess, 31 policy: policy, 32 rand: rand.New(rand.NewSource(time.Now().Unix())), 33 } 34 } 35 36 type RestartTracker struct { 37 waitRes *dstructs.WaitResult 38 startErr error 39 restartTriggered bool // Whether the task has been signalled to be restarted 40 count int // Current number of attempts. 41 onSuccess bool // Whether to restart on successful exit code. 42 startTime time.Time // When the interval began 43 reason string // The reason for the last state 44 policy *structs.RestartPolicy 45 rand *rand.Rand 46 lock sync.Mutex 47 } 48 49 // SetPolicy updates the policy used to determine restarts. 50 func (r *RestartTracker) SetPolicy(policy *structs.RestartPolicy) { 51 r.lock.Lock() 52 defer r.lock.Unlock() 53 r.policy = policy 54 } 55 56 // SetStartError is used to mark the most recent start error. If starting was 57 // successful the error should be nil. 58 func (r *RestartTracker) SetStartError(err error) *RestartTracker { 59 r.lock.Lock() 60 defer r.lock.Unlock() 61 r.startErr = err 62 return r 63 } 64 65 // SetWaitResult is used to mark the most recent wait result. 66 func (r *RestartTracker) SetWaitResult(res *dstructs.WaitResult) *RestartTracker { 67 r.lock.Lock() 68 defer r.lock.Unlock() 69 r.waitRes = res 70 return r 71 } 72 73 // SetRestartTriggered is used to mark that the task has been signalled to be 74 // restarted 75 func (r *RestartTracker) SetRestartTriggered() *RestartTracker { 76 r.lock.Lock() 77 defer r.lock.Unlock() 78 r.restartTriggered = true 79 return r 80 } 81 82 // GetReason returns a human-readable description for the last state returned by 83 // GetState. 84 func (r *RestartTracker) GetReason() string { 85 r.lock.Lock() 86 defer r.lock.Unlock() 87 return r.reason 88 } 89 90 // GetState returns the tasks next state given the set exit code and start 91 // error. One of the following states are returned: 92 // * TaskRestarting - Task should be restarted 93 // * TaskNotRestarting - Task should not be restarted and has exceeded its 94 // restart policy. 95 // * TaskTerminated - Task has terminated successfully and does not need a 96 // restart. 97 // 98 // If TaskRestarting is returned, the duration is how long to wait until 99 // starting the task again. 100 func (r *RestartTracker) GetState() (string, time.Duration) { 101 r.lock.Lock() 102 defer r.lock.Unlock() 103 104 // Clear out the existing state 105 defer func() { 106 r.startErr = nil 107 r.waitRes = nil 108 r.restartTriggered = false 109 }() 110 111 // Hot path if a restart was triggered 112 if r.restartTriggered { 113 r.reason = "" 114 return structs.TaskRestarting, 0 115 } 116 117 // Hot path if no attempts are expected 118 if r.policy.Attempts == 0 { 119 r.reason = ReasonNoRestartsAllowed 120 if r.waitRes != nil && r.waitRes.Successful() { 121 return structs.TaskTerminated, 0 122 } 123 124 return structs.TaskNotRestarting, 0 125 } 126 127 r.count++ 128 129 // Check if we have entered a new interval. 130 end := r.startTime.Add(r.policy.Interval) 131 now := time.Now() 132 if now.After(end) { 133 r.count = 0 134 r.startTime = now 135 } 136 137 if r.startErr != nil { 138 return r.handleStartError() 139 } else if r.waitRes != nil { 140 return r.handleWaitResult() 141 } 142 143 return "", 0 144 } 145 146 // handleStartError returns the new state and potential wait duration for 147 // restarting the task after it was not successfully started. On start errors, 148 // the restart policy is always treated as fail mode to ensure we don't 149 // infinitely try to start a task. 150 func (r *RestartTracker) handleStartError() (string, time.Duration) { 151 // If the error is not recoverable, do not restart. 152 if !structs.IsRecoverable(r.startErr) { 153 r.reason = ReasonUnrecoverableErrror 154 return structs.TaskNotRestarting, 0 155 } 156 157 if r.count > r.policy.Attempts { 158 if r.policy.Mode == structs.RestartPolicyModeFail { 159 r.reason = fmt.Sprintf( 160 `Exceeded allowed attempts %d in interval %v and mode is "fail"`, 161 r.policy.Attempts, r.policy.Interval) 162 return structs.TaskNotRestarting, 0 163 } else { 164 r.reason = ReasonDelay 165 return structs.TaskRestarting, r.getDelay() 166 } 167 } 168 169 r.reason = ReasonWithinPolicy 170 return structs.TaskRestarting, r.jitter() 171 } 172 173 // handleWaitResult returns the new state and potential wait duration for 174 // restarting the task after it has exited. 175 func (r *RestartTracker) handleWaitResult() (string, time.Duration) { 176 // If the task started successfully and restart on success isn't specified, 177 // don't restart but don't mark as failed. 178 if r.waitRes.Successful() && !r.onSuccess { 179 r.reason = "Restart unnecessary as task terminated successfully" 180 return structs.TaskTerminated, 0 181 } 182 183 if r.count > r.policy.Attempts { 184 if r.policy.Mode == structs.RestartPolicyModeFail { 185 r.reason = fmt.Sprintf( 186 `Exceeded allowed attempts %d in interval %v and mode is "fail"`, 187 r.policy.Attempts, r.policy.Interval) 188 return structs.TaskNotRestarting, 0 189 } else { 190 r.reason = ReasonDelay 191 return structs.TaskRestarting, r.getDelay() 192 } 193 } 194 195 r.reason = ReasonWithinPolicy 196 return structs.TaskRestarting, r.jitter() 197 } 198 199 // getDelay returns the delay time to enter the next interval. 200 func (r *RestartTracker) getDelay() time.Duration { 201 end := r.startTime.Add(r.policy.Interval) 202 now := time.Now() 203 return end.Sub(now) 204 } 205 206 // jitter returns the delay time plus a jitter. 207 func (r *RestartTracker) jitter() time.Duration { 208 // Get the delay and ensure it is valid. 209 d := r.policy.Delay.Nanoseconds() 210 if d == 0 { 211 d = 1 212 } 213 214 j := float64(r.rand.Int63n(d)) * jitter 215 return time.Duration(d + int64(j)) 216 } 217 218 // Returns a tracker that never restarts. 219 func noRestartsTracker() *RestartTracker { 220 policy := &structs.RestartPolicy{Attempts: 0, Mode: structs.RestartPolicyModeFail} 221 return newRestartTracker(policy, structs.JobTypeBatch) 222 }