github.com/blixtra/nomad@v0.7.2-0.20171221000451-da9a1d7bb050/client/restarts.go

github.com/blixtra/nomad@v0.7.2-0.20171221000451-da9a1d7bb050/client/restarts.go (about)

     1  package client
     2  
     3  import (
     4  	"fmt"
     5  	"math/rand"
     6  	"sync"
     7  	"time"
     8  
     9  	dstructs "github.com/hashicorp/nomad/client/driver/structs"
    10  	"github.com/hashicorp/nomad/nomad/structs"
    11  )
    12  
const (
	// jitter is the fraction of the restart delay that may be added on top of
	// it as random jitter: the jitter method adds a random amount strictly
	// below 25% of the delay.
	jitter = 0.25

	// The Reason* values are human-readable explanations that GetState
	// stores and GetReason returns.
	//
	// NOTE: ReasonUnrecoverableErrror is misspelled (three r's) but it is an
	// exported identifier, so the spelling is kept for compatibility.
	ReasonNoRestartsAllowed   = "Policy allows no restarts"
	ReasonUnrecoverableErrror = "Error was unrecoverable"
	ReasonWithinPolicy        = "Restart within policy"
	ReasonDelay               = "Exceeded allowed attempts, applying a delay"
)
    22  
    23  func newRestartTracker(policy *structs.RestartPolicy, jobType string) *RestartTracker {
    24  	onSuccess := true
    25  	if jobType == structs.JobTypeBatch {
    26  		onSuccess = false
    27  	}
    28  	return &RestartTracker{
    29  		startTime: time.Now(),
    30  		onSuccess: onSuccess,
    31  		policy:    policy,
    32  		rand:      rand.New(rand.NewSource(time.Now().Unix())),
    33  	}
    34  }
    35  
// RestartTracker decides, from a restart policy and the outcome of the most
// recent start or wait, whether a task should be restarted and how long to
// wait before doing so. Outcomes are recorded with the Set* methods and
// consumed by GetState.
type RestartTracker struct {
	// waitRes is the most recent wait result recorded by SetWaitResult.
	// GetState resets it to nil.
	waitRes          *dstructs.WaitResult
	// startErr is the most recent start error recorded by SetStartError.
	// GetState resets it to nil.
	startErr         error
	restartTriggered bool      // Whether the task has been signalled to be restarted
	failure          bool      // Whether a failure triggered the restart
	count            int       // Current number of attempts.
	onSuccess        bool      // Whether to restart on successful exit code.
	startTime        time.Time // When the interval began
	reason           string    // The reason for the last state
	// policy is the restart policy consulted by GetState; SetPolicy replaces it.
	policy           *structs.RestartPolicy
	// rand is the random source used to compute restart jitter.
	rand             *rand.Rand
	// lock guards the fields above; the exported methods in this file
	// acquire it.
	lock             sync.Mutex
}
    49  
    50  // SetPolicy updates the policy used to determine restarts.
    51  func (r *RestartTracker) SetPolicy(policy *structs.RestartPolicy) {
    52  	r.lock.Lock()
    53  	defer r.lock.Unlock()
    54  	r.policy = policy
    55  }
    56  
    57  // SetStartError is used to mark the most recent start error. If starting was
    58  // successful the error should be nil.
    59  func (r *RestartTracker) SetStartError(err error) *RestartTracker {
    60  	r.lock.Lock()
    61  	defer r.lock.Unlock()
    62  	r.startErr = err
    63  	r.failure = true
    64  	return r
    65  }
    66  
    67  // SetWaitResult is used to mark the most recent wait result.
    68  func (r *RestartTracker) SetWaitResult(res *dstructs.WaitResult) *RestartTracker {
    69  	r.lock.Lock()
    70  	defer r.lock.Unlock()
    71  	r.waitRes = res
    72  	r.failure = true
    73  	return r
    74  }
    75  
    76  // SetRestartTriggered is used to mark that the task has been signalled to be
    77  // restarted. Setting the failure to true restarts according to the restart
    78  // policy. When failure is false the task is restarted without considering the
    79  // restart policy.
    80  func (r *RestartTracker) SetRestartTriggered(failure bool) *RestartTracker {
    81  	r.lock.Lock()
    82  	defer r.lock.Unlock()
    83  	if failure {
    84  		r.failure = true
    85  	} else {
    86  		r.restartTriggered = true
    87  	}
    88  	return r
    89  }
    90  
    91  // GetReason returns a human-readable description for the last state returned by
    92  // GetState.
    93  func (r *RestartTracker) GetReason() string {
    94  	r.lock.Lock()
    95  	defer r.lock.Unlock()
    96  	return r.reason
    97  }
    98  
    99  // GetState returns the tasks next state given the set exit code and start
   100  // error. One of the following states are returned:
   101  // * TaskRestarting - Task should be restarted
   102  // * TaskNotRestarting - Task should not be restarted and has exceeded its
   103  //   restart policy.
   104  // * TaskTerminated - Task has terminated successfully and does not need a
   105  //   restart.
   106  //
   107  // If TaskRestarting is returned, the duration is how long to wait until
   108  // starting the task again.
   109  func (r *RestartTracker) GetState() (string, time.Duration) {
   110  	r.lock.Lock()
   111  	defer r.lock.Unlock()
   112  
   113  	// Clear out the existing state
   114  	defer func() {
   115  		r.startErr = nil
   116  		r.waitRes = nil
   117  		r.restartTriggered = false
   118  		r.failure = false
   119  	}()
   120  
   121  	// Hot path if a restart was triggered
   122  	if r.restartTriggered {
   123  		r.reason = ""
   124  		return structs.TaskRestarting, 0
   125  	}
   126  
   127  	// Hot path if no attempts are expected
   128  	if r.policy.Attempts == 0 {
   129  		r.reason = ReasonNoRestartsAllowed
   130  		if r.waitRes != nil && r.waitRes.Successful() {
   131  			return structs.TaskTerminated, 0
   132  		}
   133  
   134  		return structs.TaskNotRestarting, 0
   135  	}
   136  
   137  	r.count++
   138  
   139  	// Check if we have entered a new interval.
   140  	end := r.startTime.Add(r.policy.Interval)
   141  	now := time.Now()
   142  	if now.After(end) {
   143  		r.count = 0
   144  		r.startTime = now
   145  	}
   146  
   147  	// Handle restarts due to failures
   148  	if !r.failure {
   149  		return "", 0
   150  	}
   151  
   152  	if r.startErr != nil {
   153  		// If the error is not recoverable, do not restart.
   154  		if !structs.IsRecoverable(r.startErr) {
   155  			r.reason = ReasonUnrecoverableErrror
   156  			return structs.TaskNotRestarting, 0
   157  		}
   158  	} else if r.waitRes != nil {
   159  		// If the task started successfully and restart on success isn't specified,
   160  		// don't restart but don't mark as failed.
   161  		if r.waitRes.Successful() && !r.onSuccess {
   162  			r.reason = "Restart unnecessary as task terminated successfully"
   163  			return structs.TaskTerminated, 0
   164  		}
   165  	}
   166  
   167  	// If this task has been restarted due to failures more times
   168  	// than the restart policy allows within an interval fail
   169  	// according to the restart policy's mode.
   170  	if r.count > r.policy.Attempts {
   171  		if r.policy.Mode == structs.RestartPolicyModeFail {
   172  			r.reason = fmt.Sprintf(
   173  				`Exceeded allowed attempts %d in interval %v and mode is "fail"`,
   174  				r.policy.Attempts, r.policy.Interval)
   175  			return structs.TaskNotRestarting, 0
   176  		} else {
   177  			r.reason = ReasonDelay
   178  			return structs.TaskRestarting, r.getDelay()
   179  		}
   180  	}
   181  
   182  	r.reason = ReasonWithinPolicy
   183  	return structs.TaskRestarting, r.jitter()
   184  }
   185  
   186  // getDelay returns the delay time to enter the next interval.
   187  func (r *RestartTracker) getDelay() time.Duration {
   188  	end := r.startTime.Add(r.policy.Interval)
   189  	now := time.Now()
   190  	return end.Sub(now)
   191  }
   192  
   193  // jitter returns the delay time plus a jitter.
   194  func (r *RestartTracker) jitter() time.Duration {
   195  	// Get the delay and ensure it is valid.
   196  	d := r.policy.Delay.Nanoseconds()
   197  	if d == 0 {
   198  		d = 1
   199  	}
   200  
   201  	j := float64(r.rand.Int63n(d)) * jitter
   202  	return time.Duration(d + int64(j))
   203  }