github.com/hhrutter/nomad@v0.6.0-rc2.0.20170723054333-80c4b03f0705/client/restarts.go

package client

import (
	"fmt"
	"math/rand"
	"sync"
	"time"

	dstructs "github.com/hashicorp/nomad/client/driver/structs"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// jitter is the percent of jitter added to restart delays.
	jitter = 0.25

	ReasonNoRestartsAllowed  = "Policy allows no restarts"
	ReasonUnrecoverableError = "Error was unrecoverable"
	ReasonWithinPolicy       = "Restart within policy"
	ReasonDelay              = "Exceeded allowed attempts, applying a delay"
)

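// newRestartTracker returns a tracker initialized from the given restart
// policy. Batch jobs are not restarted after a successful exit; all other
// job types are.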
func newRestartTracker(policy *structs.RestartPolicy, jobType string) *RestartTracker {
	onSuccess := true
	if jobType == structs.JobTypeBatch {
		onSuccess = false
	}
	return &RestartTracker{
		startTime: time.Now(),
		onSuccess: onSuccess,
		policy:    policy,
		rand:      rand.New(rand.NewSource(time.Now().Unix())),
	}
}

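// RestartTracker tracks a task's restarts within the policy's interval and
// determines the task's next state along with any delay to apply before the
// next restart.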
type RestartTracker struct {
	waitRes          *dstructs.WaitResult
	startErr         error
	restartTriggered bool      // Whether the task has been signalled to be restarted
	count            int       // Current number of attempts.
	onSuccess        bool      // Whether to restart on successful exit code.
	startTime        time.Time // When the interval began
	reason           string    // The reason for the last state
	policy           *structs.RestartPolicy
	rand             *rand.Rand
	lock             sync.Mutex
}

// SetPolicy updates the policy used to determine restarts.
func (r *RestartTracker) SetPolicy(policy *structs.RestartPolicy) {
	r.lock.Lock()
	defer r.lock.Unlock()
	r.policy = policy
}

// SetStartError is used to mark the most recent start error. If starting was
// successful, the error should be nil.
func (r *RestartTracker) SetStartError(err error) *RestartTracker {
	r.lock.Lock()
	defer r.lock.Unlock()
	r.startErr = err
	return r
}

// SetWaitResult is used to mark the most recent wait result.
func (r *RestartTracker) SetWaitResult(res *dstructs.WaitResult) *RestartTracker {
	r.lock.Lock()
	defer r.lock.Unlock()
	r.waitRes = res
	return r
}

// SetRestartTriggered is used to mark that the task has been signalled to be
// restarted.
func (r *RestartTracker) SetRestartTriggered() *RestartTracker {
	r.lock.Lock()
	defer r.lock.Unlock()
	r.restartTriggered = true
	return r
}

// GetReason returns a human-readable description for the last state returned by
// GetState.
func (r *RestartTracker) GetReason() string {
	r.lock.Lock()
	defer r.lock.Unlock()
	return r.reason
}

// GetState returns the task's next state given the set exit code and start
// error. One of the following states is returned:
// * TaskRestarting - Task should be restarted
// * TaskNotRestarting - Task should not be restarted and has exceeded its
//   restart policy.
// * TaskTerminated - Task has terminated successfully and does not need a
//   restart.
//
// If TaskRestarting is returned, the duration is how long to wait until
// starting the task again.
func (r *RestartTracker) GetState() (string, time.Duration) {
	r.lock.Lock()
	defer r.lock.Unlock()

	// Clear out the existing state
	defer func() {
		r.startErr = nil
		r.waitRes = nil
		r.restartTriggered = false
	}()

	// Hot path if a restart was triggered
	if r.restartTriggered {
		r.reason = ""
		return structs.TaskRestarting, 0
	}

	// Hot path if no attempts are expected
	if r.policy.Attempts == 0 {
		r.reason = ReasonNoRestartsAllowed
		if r.waitRes != nil && r.waitRes.Successful() {
			return structs.TaskTerminated, 0
		}

		return structs.TaskNotRestarting, 0
	}

	r.count++

	// Check if we have entered a new interval.
	end := r.startTime.Add(r.policy.Interval)
	now := time.Now()
	if now.After(end) {
		r.count = 0
		r.startTime = now
	}

	if r.startErr != nil {
		return r.handleStartError()
	} else if r.waitRes != nil {
		return r.handleWaitResult()
	}

	return "", 0
}

// handleStartError returns the new state and potential wait duration for
// restarting the task after it was not successfully started. Unrecoverable
// start errors never trigger a restart; recoverable ones are subject to the
// restart policy just like a failed exit.
func (r *RestartTracker) handleStartError() (string, time.Duration) {
	// If the error is not recoverable, do not restart.
	if !structs.IsRecoverable(r.startErr) {
		r.reason = ReasonUnrecoverableError
		return structs.TaskNotRestarting, 0
	}

	if r.count > r.policy.Attempts {
		if r.policy.Mode == structs.RestartPolicyModeFail {
			r.reason = fmt.Sprintf(
				`Exceeded allowed attempts %d in interval %v and mode is "fail"`,
				r.policy.Attempts, r.policy.Interval)
			return structs.TaskNotRestarting, 0
		} else {
			r.reason = ReasonDelay
			return structs.TaskRestarting, r.getDelay()
		}
	}

	r.reason = ReasonWithinPolicy
	return structs.TaskRestarting, r.jitter()
}

// handleWaitResult returns the new state and potential wait duration for
// restarting the task after it has exited.
func (r *RestartTracker) handleWaitResult() (string, time.Duration) {
	// If the task started successfully and restart on success isn't specified,
	// don't restart but don't mark as failed.
	if r.waitRes.Successful() && !r.onSuccess {
		r.reason = "Restart unnecessary as task terminated successfully"
		return structs.TaskTerminated, 0
	}

	if r.count > r.policy.Attempts {
		if r.policy.Mode == structs.RestartPolicyModeFail {
			r.reason = fmt.Sprintf(
				`Exceeded allowed attempts %d in interval %v and mode is "fail"`,
				r.policy.Attempts, r.policy.Interval)
			return structs.TaskNotRestarting, 0
		} else {
			r.reason = ReasonDelay
			return structs.TaskRestarting, r.getDelay()
		}
	}

	r.reason = ReasonWithinPolicy
	return structs.TaskRestarting, r.jitter()
}

// getDelay returns the time remaining before the next interval begins.
func (r *RestartTracker) getDelay() time.Duration {
	end := r.startTime.Add(r.policy.Interval)
	now := time.Now()
	return end.Sub(now)
}

// jitter returns the policy delay plus a random jitter of up to 25% of that
// delay.
func (r *RestartTracker) jitter() time.Duration {
	// Get the delay and ensure it is valid (Int63n panics on a non-positive
	// argument).
	d := r.policy.Delay.Nanoseconds()
	if d == 0 {
		d = 1
	}

	j := float64(r.rand.Int63n(d)) * jitter
	return time.Duration(d + int64(j))
}

// noRestartsTracker returns a tracker that never restarts.
func noRestartsTracker() *RestartTracker {
	policy := &structs.RestartPolicy{Attempts: 0, Mode: structs.RestartPolicyModeFail}
	return newRestartTracker(policy, structs.JobTypeBatch)
}
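
// Example usage (illustrative sketch only; the policy values, the use of
// structs.JobTypeService, and the err variable below are assumptions made for
// this example, not taken from the file above):
//
//	policy := &structs.RestartPolicy{
//		Attempts: 2,
//		Interval: 10 * time.Minute,
//		Delay:    15 * time.Second,
//		Mode:     structs.RestartPolicyModeDelay,
//	}
//	rt := newRestartTracker(policy, structs.JobTypeService)
//
//	// After a failed start attempt, record the error and ask for the next state.
//	state, wait := rt.SetStartError(err).GetState()
//	if state == structs.TaskRestarting {
//		time.Sleep(wait) // within-policy delays include up to 25% jitter
//	}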