github.com/blixtra/nomad@v0.7.2-0.20171221000451-da9a1d7bb050/client/restarts.go (about) 1 package client 2 3 import ( 4 "fmt" 5 "math/rand" 6 "sync" 7 "time" 8 9 dstructs "github.com/hashicorp/nomad/client/driver/structs" 10 "github.com/hashicorp/nomad/nomad/structs" 11 ) 12 13 const ( 14 // jitter is the percent of jitter added to restart delays. 15 jitter = 0.25 16 17 ReasonNoRestartsAllowed = "Policy allows no restarts" 18 ReasonUnrecoverableErrror = "Error was unrecoverable" 19 ReasonWithinPolicy = "Restart within policy" 20 ReasonDelay = "Exceeded allowed attempts, applying a delay" 21 ) 22 23 func newRestartTracker(policy *structs.RestartPolicy, jobType string) *RestartTracker { 24 onSuccess := true 25 if jobType == structs.JobTypeBatch { 26 onSuccess = false 27 } 28 return &RestartTracker{ 29 startTime: time.Now(), 30 onSuccess: onSuccess, 31 policy: policy, 32 rand: rand.New(rand.NewSource(time.Now().Unix())), 33 } 34 } 35 36 type RestartTracker struct { 37 waitRes *dstructs.WaitResult 38 startErr error 39 restartTriggered bool // Whether the task has been signalled to be restarted 40 failure bool // Whether a failure triggered the restart 41 count int // Current number of attempts. 42 onSuccess bool // Whether to restart on successful exit code. 43 startTime time.Time // When the interval began 44 reason string // The reason for the last state 45 policy *structs.RestartPolicy 46 rand *rand.Rand 47 lock sync.Mutex 48 } 49 50 // SetPolicy updates the policy used to determine restarts. 51 func (r *RestartTracker) SetPolicy(policy *structs.RestartPolicy) { 52 r.lock.Lock() 53 defer r.lock.Unlock() 54 r.policy = policy 55 } 56 57 // SetStartError is used to mark the most recent start error. If starting was 58 // successful the error should be nil. 59 func (r *RestartTracker) SetStartError(err error) *RestartTracker { 60 r.lock.Lock() 61 defer r.lock.Unlock() 62 r.startErr = err 63 r.failure = true 64 return r 65 } 66 67 // SetWaitResult is used to mark the most recent wait result. 68 func (r *RestartTracker) SetWaitResult(res *dstructs.WaitResult) *RestartTracker { 69 r.lock.Lock() 70 defer r.lock.Unlock() 71 r.waitRes = res 72 r.failure = true 73 return r 74 } 75 76 // SetRestartTriggered is used to mark that the task has been signalled to be 77 // restarted. Setting the failure to true restarts according to the restart 78 // policy. When failure is false the task is restarted without considering the 79 // restart policy. 80 func (r *RestartTracker) SetRestartTriggered(failure bool) *RestartTracker { 81 r.lock.Lock() 82 defer r.lock.Unlock() 83 if failure { 84 r.failure = true 85 } else { 86 r.restartTriggered = true 87 } 88 return r 89 } 90 91 // GetReason returns a human-readable description for the last state returned by 92 // GetState. 93 func (r *RestartTracker) GetReason() string { 94 r.lock.Lock() 95 defer r.lock.Unlock() 96 return r.reason 97 } 98 99 // GetState returns the tasks next state given the set exit code and start 100 // error. One of the following states are returned: 101 // * TaskRestarting - Task should be restarted 102 // * TaskNotRestarting - Task should not be restarted and has exceeded its 103 // restart policy. 104 // * TaskTerminated - Task has terminated successfully and does not need a 105 // restart. 106 // 107 // If TaskRestarting is returned, the duration is how long to wait until 108 // starting the task again. 109 func (r *RestartTracker) GetState() (string, time.Duration) { 110 r.lock.Lock() 111 defer r.lock.Unlock() 112 113 // Clear out the existing state 114 defer func() { 115 r.startErr = nil 116 r.waitRes = nil 117 r.restartTriggered = false 118 r.failure = false 119 }() 120 121 // Hot path if a restart was triggered 122 if r.restartTriggered { 123 r.reason = "" 124 return structs.TaskRestarting, 0 125 } 126 127 // Hot path if no attempts are expected 128 if r.policy.Attempts == 0 { 129 r.reason = ReasonNoRestartsAllowed 130 if r.waitRes != nil && r.waitRes.Successful() { 131 return structs.TaskTerminated, 0 132 } 133 134 return structs.TaskNotRestarting, 0 135 } 136 137 r.count++ 138 139 // Check if we have entered a new interval. 140 end := r.startTime.Add(r.policy.Interval) 141 now := time.Now() 142 if now.After(end) { 143 r.count = 0 144 r.startTime = now 145 } 146 147 // Handle restarts due to failures 148 if !r.failure { 149 return "", 0 150 } 151 152 if r.startErr != nil { 153 // If the error is not recoverable, do not restart. 154 if !structs.IsRecoverable(r.startErr) { 155 r.reason = ReasonUnrecoverableErrror 156 return structs.TaskNotRestarting, 0 157 } 158 } else if r.waitRes != nil { 159 // If the task started successfully and restart on success isn't specified, 160 // don't restart but don't mark as failed. 161 if r.waitRes.Successful() && !r.onSuccess { 162 r.reason = "Restart unnecessary as task terminated successfully" 163 return structs.TaskTerminated, 0 164 } 165 } 166 167 // If this task has been restarted due to failures more times 168 // than the restart policy allows within an interval fail 169 // according to the restart policy's mode. 170 if r.count > r.policy.Attempts { 171 if r.policy.Mode == structs.RestartPolicyModeFail { 172 r.reason = fmt.Sprintf( 173 `Exceeded allowed attempts %d in interval %v and mode is "fail"`, 174 r.policy.Attempts, r.policy.Interval) 175 return structs.TaskNotRestarting, 0 176 } else { 177 r.reason = ReasonDelay 178 return structs.TaskRestarting, r.getDelay() 179 } 180 } 181 182 r.reason = ReasonWithinPolicy 183 return structs.TaskRestarting, r.jitter() 184 } 185 186 // getDelay returns the delay time to enter the next interval. 187 func (r *RestartTracker) getDelay() time.Duration { 188 end := r.startTime.Add(r.policy.Interval) 189 now := time.Now() 190 return end.Sub(now) 191 } 192 193 // jitter returns the delay time plus a jitter. 194 func (r *RestartTracker) jitter() time.Duration { 195 // Get the delay and ensure it is valid. 196 d := r.policy.Delay.Nanoseconds() 197 if d == 0 { 198 d = 1 199 } 200 201 j := float64(r.rand.Int63n(d)) * jitter 202 return time.Duration(d + int64(j)) 203 }