github.com/dkerwin/nomad@v0.3.3-0.20160525181927-74554135514b/client/restarts.go (about)

     1  package client
     2  
     3  import (
     4  	"fmt"
     5  	"math/rand"
     6  	"sync"
     7  	"time"
     8  
     9  	cstructs "github.com/hashicorp/nomad/client/driver/structs"
    10  	"github.com/hashicorp/nomad/nomad/structs"
    11  )
    12  
    13  const (
    14  	// jitter is the percent of jitter added to restart delays.
    15  	jitter = 0.25
    16  
    17  	ReasonNoRestartsAllowed   = "Policy allows no restarts"
    18  	ReasonUnrecoverableErrror = "Error was unrecoverable"
    19  	ReasonWithinPolicy        = "Restart within policy"
    20  	ReasonDelay               = "Exceeded allowed attempts, applying a delay"
    21  )
    22  
    23  func newRestartTracker(policy *structs.RestartPolicy, jobType string) *RestartTracker {
    24  	onSuccess := true
    25  	if jobType == structs.JobTypeBatch {
    26  		onSuccess = false
    27  	}
    28  	return &RestartTracker{
    29  		startTime: time.Now(),
    30  		onSuccess: onSuccess,
    31  		policy:    policy,
    32  		rand:      rand.New(rand.NewSource(time.Now().Unix())),
    33  	}
    34  }
    35  
    36  type RestartTracker struct {
    37  	waitRes   *cstructs.WaitResult
    38  	startErr  error
    39  	count     int       // Current number of attempts.
    40  	onSuccess bool      // Whether to restart on successful exit code.
    41  	startTime time.Time // When the interval began
    42  	reason    string    // The reason for the last state
    43  	policy    *structs.RestartPolicy
    44  	rand      *rand.Rand
    45  	lock      sync.Mutex
    46  }
    47  
    48  // SetPolicy updates the policy used to determine restarts.
    49  func (r *RestartTracker) SetPolicy(policy *structs.RestartPolicy) {
    50  	r.lock.Lock()
    51  	defer r.lock.Unlock()
    52  	r.policy = policy
    53  }
    54  
    55  // SetStartError is used to mark the most recent start error. If starting was
    56  // successful the error should be nil.
    57  func (r *RestartTracker) SetStartError(err error) *RestartTracker {
    58  	r.lock.Lock()
    59  	defer r.lock.Unlock()
    60  	r.startErr = err
    61  	return r
    62  }
    63  
    64  // SetWaitResult is used to mark the most recent wait result.
    65  func (r *RestartTracker) SetWaitResult(res *cstructs.WaitResult) *RestartTracker {
    66  	r.lock.Lock()
    67  	defer r.lock.Unlock()
    68  	r.waitRes = res
    69  	return r
    70  }
    71  
    72  // GetReason returns a human-readable description for the last state returned by
    73  // GetState.
    74  func (r *RestartTracker) GetReason() string {
    75  	r.lock.Lock()
    76  	defer r.lock.Unlock()
    77  	return r.reason
    78  }
    79  
    80  // GetState returns the tasks next state given the set exit code and start
    81  // error. One of the following states are returned:
    82  // * TaskRestarting - Task should be restarted
    83  // * TaskNotRestarting - Task should not be restarted and has exceeded its
    84  //   restart policy.
    85  // * TaskTerminated - Task has terminated successfully and does not need a
    86  //   restart.
    87  //
    88  // If TaskRestarting is returned, the duration is how long to wait until
    89  // starting the task again.
    90  func (r *RestartTracker) GetState() (string, time.Duration) {
    91  	r.lock.Lock()
    92  	defer r.lock.Unlock()
    93  
    94  	// Hot path if no attempts are expected
    95  	if r.policy.Attempts == 0 {
    96  		r.reason = ReasonNoRestartsAllowed
    97  		if r.waitRes != nil && r.waitRes.Successful() {
    98  			return structs.TaskTerminated, 0
    99  		}
   100  
   101  		return structs.TaskNotRestarting, 0
   102  	}
   103  
   104  	r.count++
   105  
   106  	// Check if we have entered a new interval.
   107  	end := r.startTime.Add(r.policy.Interval)
   108  	now := time.Now()
   109  	if now.After(end) {
   110  		r.count = 0
   111  		r.startTime = now
   112  	}
   113  
   114  	if r.startErr != nil {
   115  		return r.handleStartError()
   116  	} else if r.waitRes != nil {
   117  		return r.handleWaitResult()
   118  	} else {
   119  		return "", 0
   120  	}
   121  }
   122  
   123  // handleStartError returns the new state and potential wait duration for
   124  // restarting the task after it was not successfully started. On start errors,
   125  // the restart policy is always treated as fail mode to ensure we don't
   126  // infinitely try to start a task.
   127  func (r *RestartTracker) handleStartError() (string, time.Duration) {
   128  	// If the error is not recoverable, do not restart.
   129  	if rerr, ok := r.startErr.(*cstructs.RecoverableError); !(ok && rerr.Recoverable) {
   130  		r.reason = ReasonUnrecoverableErrror
   131  		return structs.TaskNotRestarting, 0
   132  	}
   133  
   134  	if r.count > r.policy.Attempts {
   135  		r.reason = fmt.Sprintf("Exceeded allowed attempts %d in interval %v",
   136  			r.policy.Attempts, r.policy.Interval)
   137  		return structs.TaskNotRestarting, 0
   138  	}
   139  
   140  	r.reason = ReasonWithinPolicy
   141  	return structs.TaskRestarting, r.jitter()
   142  }
   143  
   144  // handleWaitResult returns the new state and potential wait duration for
   145  // restarting the task after it has exited.
   146  func (r *RestartTracker) handleWaitResult() (string, time.Duration) {
   147  	// If the task started successfully and restart on success isn't specified,
   148  	// don't restart but don't mark as failed.
   149  	if r.waitRes.Successful() && !r.onSuccess {
   150  		r.reason = "Restart unnecessary as task terminated successfully"
   151  		return structs.TaskTerminated, 0
   152  	}
   153  
   154  	if r.count > r.policy.Attempts {
   155  		if r.policy.Mode == structs.RestartPolicyModeFail {
   156  			r.reason = fmt.Sprintf(
   157  				`Exceeded allowed atttempts %d in interval %v and mode is "fail"`,
   158  				r.policy.Attempts, r.policy.Interval)
   159  			return structs.TaskNotRestarting, 0
   160  		} else {
   161  			r.reason = ReasonDelay
   162  			return structs.TaskRestarting, r.getDelay()
   163  		}
   164  	}
   165  
   166  	r.reason = ReasonWithinPolicy
   167  	return structs.TaskRestarting, r.jitter()
   168  }
   169  
   170  // getDelay returns the delay time to enter the next interval.
   171  func (r *RestartTracker) getDelay() time.Duration {
   172  	end := r.startTime.Add(r.policy.Interval)
   173  	now := time.Now()
   174  	return end.Sub(now)
   175  }
   176  
   177  // jitter returns the delay time plus a jitter.
   178  func (r *RestartTracker) jitter() time.Duration {
   179  	// Get the delay and ensure it is valid.
   180  	d := r.policy.Delay.Nanoseconds()
   181  	if d == 0 {
   182  		d = 1
   183  	}
   184  
   185  	j := float64(r.rand.Int63n(d)) * jitter
   186  	return time.Duration(d + int64(j))
   187  }
   188  
   189  // Returns a tracker that never restarts.
   190  func noRestartsTracker() *RestartTracker {
   191  	policy := &structs.RestartPolicy{Attempts: 0, Mode: structs.RestartPolicyModeFail}
   192  	return newRestartTracker(policy, structs.JobTypeBatch)
   193  }