github.com/dkerwin/nomad@v0.3.3-0.20160525181927-74554135514b/client/restarts.go (about) 1 package client 2 3 import ( 4 "fmt" 5 "math/rand" 6 "sync" 7 "time" 8 9 cstructs "github.com/hashicorp/nomad/client/driver/structs" 10 "github.com/hashicorp/nomad/nomad/structs" 11 ) 12 13 const ( 14 // jitter is the percent of jitter added to restart delays. 15 jitter = 0.25 16 17 ReasonNoRestartsAllowed = "Policy allows no restarts" 18 ReasonUnrecoverableErrror = "Error was unrecoverable" 19 ReasonWithinPolicy = "Restart within policy" 20 ReasonDelay = "Exceeded allowed attempts, applying a delay" 21 ) 22 23 func newRestartTracker(policy *structs.RestartPolicy, jobType string) *RestartTracker { 24 onSuccess := true 25 if jobType == structs.JobTypeBatch { 26 onSuccess = false 27 } 28 return &RestartTracker{ 29 startTime: time.Now(), 30 onSuccess: onSuccess, 31 policy: policy, 32 rand: rand.New(rand.NewSource(time.Now().Unix())), 33 } 34 } 35 36 type RestartTracker struct { 37 waitRes *cstructs.WaitResult 38 startErr error 39 count int // Current number of attempts. 40 onSuccess bool // Whether to restart on successful exit code. 41 startTime time.Time // When the interval began 42 reason string // The reason for the last state 43 policy *structs.RestartPolicy 44 rand *rand.Rand 45 lock sync.Mutex 46 } 47 48 // SetPolicy updates the policy used to determine restarts. 49 func (r *RestartTracker) SetPolicy(policy *structs.RestartPolicy) { 50 r.lock.Lock() 51 defer r.lock.Unlock() 52 r.policy = policy 53 } 54 55 // SetStartError is used to mark the most recent start error. If starting was 56 // successful the error should be nil. 57 func (r *RestartTracker) SetStartError(err error) *RestartTracker { 58 r.lock.Lock() 59 defer r.lock.Unlock() 60 r.startErr = err 61 return r 62 } 63 64 // SetWaitResult is used to mark the most recent wait result. 65 func (r *RestartTracker) SetWaitResult(res *cstructs.WaitResult) *RestartTracker { 66 r.lock.Lock() 67 defer r.lock.Unlock() 68 r.waitRes = res 69 return r 70 } 71 72 // GetReason returns a human-readable description for the last state returned by 73 // GetState. 74 func (r *RestartTracker) GetReason() string { 75 r.lock.Lock() 76 defer r.lock.Unlock() 77 return r.reason 78 } 79 80 // GetState returns the tasks next state given the set exit code and start 81 // error. One of the following states are returned: 82 // * TaskRestarting - Task should be restarted 83 // * TaskNotRestarting - Task should not be restarted and has exceeded its 84 // restart policy. 85 // * TaskTerminated - Task has terminated successfully and does not need a 86 // restart. 87 // 88 // If TaskRestarting is returned, the duration is how long to wait until 89 // starting the task again. 90 func (r *RestartTracker) GetState() (string, time.Duration) { 91 r.lock.Lock() 92 defer r.lock.Unlock() 93 94 // Hot path if no attempts are expected 95 if r.policy.Attempts == 0 { 96 r.reason = ReasonNoRestartsAllowed 97 if r.waitRes != nil && r.waitRes.Successful() { 98 return structs.TaskTerminated, 0 99 } 100 101 return structs.TaskNotRestarting, 0 102 } 103 104 r.count++ 105 106 // Check if we have entered a new interval. 107 end := r.startTime.Add(r.policy.Interval) 108 now := time.Now() 109 if now.After(end) { 110 r.count = 0 111 r.startTime = now 112 } 113 114 if r.startErr != nil { 115 return r.handleStartError() 116 } else if r.waitRes != nil { 117 return r.handleWaitResult() 118 } else { 119 return "", 0 120 } 121 } 122 123 // handleStartError returns the new state and potential wait duration for 124 // restarting the task after it was not successfully started. On start errors, 125 // the restart policy is always treated as fail mode to ensure we don't 126 // infinitely try to start a task. 127 func (r *RestartTracker) handleStartError() (string, time.Duration) { 128 // If the error is not recoverable, do not restart. 129 if rerr, ok := r.startErr.(*cstructs.RecoverableError); !(ok && rerr.Recoverable) { 130 r.reason = ReasonUnrecoverableErrror 131 return structs.TaskNotRestarting, 0 132 } 133 134 if r.count > r.policy.Attempts { 135 r.reason = fmt.Sprintf("Exceeded allowed attempts %d in interval %v", 136 r.policy.Attempts, r.policy.Interval) 137 return structs.TaskNotRestarting, 0 138 } 139 140 r.reason = ReasonWithinPolicy 141 return structs.TaskRestarting, r.jitter() 142 } 143 144 // handleWaitResult returns the new state and potential wait duration for 145 // restarting the task after it has exited. 146 func (r *RestartTracker) handleWaitResult() (string, time.Duration) { 147 // If the task started successfully and restart on success isn't specified, 148 // don't restart but don't mark as failed. 149 if r.waitRes.Successful() && !r.onSuccess { 150 r.reason = "Restart unnecessary as task terminated successfully" 151 return structs.TaskTerminated, 0 152 } 153 154 if r.count > r.policy.Attempts { 155 if r.policy.Mode == structs.RestartPolicyModeFail { 156 r.reason = fmt.Sprintf( 157 `Exceeded allowed atttempts %d in interval %v and mode is "fail"`, 158 r.policy.Attempts, r.policy.Interval) 159 return structs.TaskNotRestarting, 0 160 } else { 161 r.reason = ReasonDelay 162 return structs.TaskRestarting, r.getDelay() 163 } 164 } 165 166 r.reason = ReasonWithinPolicy 167 return structs.TaskRestarting, r.jitter() 168 } 169 170 // getDelay returns the delay time to enter the next interval. 171 func (r *RestartTracker) getDelay() time.Duration { 172 end := r.startTime.Add(r.policy.Interval) 173 now := time.Now() 174 return end.Sub(now) 175 } 176 177 // jitter returns the delay time plus a jitter. 178 func (r *RestartTracker) jitter() time.Duration { 179 // Get the delay and ensure it is valid. 180 d := r.policy.Delay.Nanoseconds() 181 if d == 0 { 182 d = 1 183 } 184 185 j := float64(r.rand.Int63n(d)) * jitter 186 return time.Duration(d + int64(j)) 187 } 188 189 // Returns a tracker that never restarts. 190 func noRestartsTracker() *RestartTracker { 191 policy := &structs.RestartPolicy{Attempts: 0, Mode: structs.RestartPolicyModeFail} 192 return newRestartTracker(policy, structs.JobTypeBatch) 193 }