github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/client/allocrunner/taskrunner/restarts/restarts.go

github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/client/allocrunner/taskrunner/restarts/restarts.go (about)

     1  package restarts
     2  
     3  import (
     4  	"fmt"
     5  	"math/rand"
     6  	"sync"
     7  	"time"
     8  
     9  	"github.com/hashicorp/nomad/nomad/structs"
    10  	"github.com/hashicorp/nomad/plugins/drivers"
    11  )
    12  
const (
	// jitter is the fraction of the restart delay that bounds the random
	// extra time added to it (0.25 means the extra is below 25% of the delay).
	jitter = 0.25

	// ReasonNoRestartsAllowed is the reason recorded by GetState when the
	// policy's Attempts is zero.
	ReasonNoRestartsAllowed   = "Policy allows no restarts"
	// ReasonUnrecoverableErrror is the reason recorded by GetState when the
	// start error is not recoverable.
	// NOTE(review): the identifier is spelled with three r's; it is exported,
	// so renaming it would affect any code referring to it.
	ReasonUnrecoverableErrror = "Error was unrecoverable"
	// ReasonWithinPolicy is the reason recorded by GetState when the restart
	// count has not exceeded the policy's Attempts.
	ReasonWithinPolicy        = "Restart within policy"
	// ReasonDelay is the reason recorded by GetState when the attempts are
	// exceeded and the policy mode is not "fail": the restart is delayed
	// until the current interval ends.
	ReasonDelay               = "Exceeded allowed attempts, applying a delay"
)
    22  
    23  func NewRestartTracker(policy *structs.RestartPolicy, jobType string, tlc *structs.TaskLifecycleConfig) *RestartTracker {
    24  	onSuccess := jobType != structs.JobTypeBatch
    25  	if tlc != nil && tlc.Hook == structs.TaskLifecycleHookPrestart {
    26  		onSuccess = tlc.Sidecar
    27  	}
    28  
    29  	return &RestartTracker{
    30  		startTime: time.Now(),
    31  		onSuccess: onSuccess,
    32  		policy:    policy,
    33  		rand:      rand.New(rand.NewSource(time.Now().Unix())),
    34  	}
    35  }
    36  
// RestartTracker decides whether and when a task should be restarted. The
// Set* methods record what happened to the task, and GetState turns that
// recorded input into the next state and then clears it. Every exported
// method acquires lock before touching the fields.
type RestartTracker struct {
	// exitRes is the most recent exit result recorded by SetExitResult;
	// cleared by GetState.
	exitRes          *drivers.ExitResult
	// startErr is the most recent start error recorded by SetStartError;
	// cleared by GetState.
	startErr         error
	killed           bool      // Whether the task has been killed
	restartTriggered bool      // Whether the task has been signalled to be restarted
	failure          bool      // Whether a failure triggered the restart
	count            int       // Current number of attempts.
	onSuccess        bool      // Whether to restart on successful exit code.
	startTime        time.Time // When the interval began
	reason           string    // The reason for the last state
	// policy is the restart policy GetState evaluates against; replaced by
	// SetPolicy.
	policy           *structs.RestartPolicy
	// rand is the random source used to compute the restart jitter.
	rand             *rand.Rand
	// lock guards all of the fields above.
	lock             sync.Mutex
}
    51  
    52  // SetPolicy updates the policy used to determine restarts.
    53  func (r *RestartTracker) SetPolicy(policy *structs.RestartPolicy) {
    54  	r.lock.Lock()
    55  	defer r.lock.Unlock()
    56  	r.policy = policy
    57  }
    58  
    59  // GetPolicy returns a copy of the policy used to determine restarts.
    60  func (r *RestartTracker) GetPolicy() *structs.RestartPolicy {
    61  	r.lock.Lock()
    62  	defer r.lock.Unlock()
    63  	return r.policy.Copy()
    64  }
    65  
    66  // SetStartError is used to mark the most recent start error. If starting was
    67  // successful the error should be nil.
    68  func (r *RestartTracker) SetStartError(err error) *RestartTracker {
    69  	r.lock.Lock()
    70  	defer r.lock.Unlock()
    71  	r.startErr = err
    72  	r.failure = true
    73  	return r
    74  }
    75  
    76  // SetExitResult is used to mark the most recent wait result.
    77  func (r *RestartTracker) SetExitResult(res *drivers.ExitResult) *RestartTracker {
    78  	r.lock.Lock()
    79  	defer r.lock.Unlock()
    80  	r.exitRes = res
    81  	r.failure = true
    82  	return r
    83  }
    84  
    85  // SetRestartTriggered is used to mark that the task has been signalled to be
    86  // restarted. Setting the failure to true restarts according to the restart
    87  // policy. When failure is false the task is restarted without considering the
    88  // restart policy.
    89  func (r *RestartTracker) SetRestartTriggered(failure bool) *RestartTracker {
    90  	r.lock.Lock()
    91  	defer r.lock.Unlock()
    92  	if failure {
    93  		r.failure = true
    94  	} else {
    95  		r.restartTriggered = true
    96  	}
    97  	return r
    98  }
    99  
   100  // SetKilled is used to mark that the task has been killed.
   101  func (r *RestartTracker) SetKilled() *RestartTracker {
   102  	r.lock.Lock()
   103  	defer r.lock.Unlock()
   104  	r.killed = true
   105  	return r
   106  }
   107  
   108  // GetReason returns a human-readable description for the last state returned by
   109  // GetState.
   110  func (r *RestartTracker) GetReason() string {
   111  	r.lock.Lock()
   112  	defer r.lock.Unlock()
   113  	return r.reason
   114  }
   115  
   116  // GetCount returns the current restart count
   117  func (r *RestartTracker) GetCount() int {
   118  	r.lock.Lock()
   119  	defer r.lock.Unlock()
   120  	return r.count
   121  }
   122  
   123  // GetState returns the tasks next state given the set exit code and start
   124  // error. One of the following states are returned:
   125  // * TaskRestarting - Task should be restarted
   126  // * TaskNotRestarting - Task should not be restarted and has exceeded its
   127  //   restart policy.
   128  // * TaskTerminated - Task has terminated successfully and does not need a
   129  //   restart.
   130  //
   131  // If TaskRestarting is returned, the duration is how long to wait until
   132  // starting the task again.
   133  func (r *RestartTracker) GetState() (string, time.Duration) {
   134  	r.lock.Lock()
   135  	defer r.lock.Unlock()
   136  
   137  	// Clear out the existing state
   138  	defer func() {
   139  		r.startErr = nil
   140  		r.exitRes = nil
   141  		r.restartTriggered = false
   142  		r.failure = false
   143  		r.killed = false
   144  	}()
   145  
   146  	// Hot path if task was killed
   147  	if r.killed {
   148  		r.reason = ""
   149  		return structs.TaskKilled, 0
   150  	}
   151  
   152  	// Hot path if a restart was triggered
   153  	if r.restartTriggered {
   154  		r.reason = ""
   155  		return structs.TaskRestarting, 0
   156  	}
   157  
   158  	// Hot path if no attempts are expected
   159  	if r.policy.Attempts == 0 {
   160  		r.reason = ReasonNoRestartsAllowed
   161  
   162  		// If the task does not restart on a successful exit code and
   163  		// the exit code was successful: terminate.
   164  		if !r.onSuccess && r.exitRes != nil && r.exitRes.Successful() {
   165  			return structs.TaskTerminated, 0
   166  		}
   167  
   168  		// Task restarts even on a successful exit code but no restarts
   169  		// allowed.
   170  		return structs.TaskNotRestarting, 0
   171  	}
   172  
   173  	// Check if we have entered a new interval.
   174  	end := r.startTime.Add(r.policy.Interval)
   175  	now := time.Now()
   176  	if now.After(end) {
   177  		r.count = 0
   178  		r.startTime = now
   179  	}
   180  
   181  	r.count++
   182  
   183  	// Handle restarts due to failures
   184  	if !r.failure {
   185  		return "", 0
   186  	}
   187  
   188  	if r.startErr != nil {
   189  		// If the error is not recoverable, do not restart.
   190  		if !structs.IsRecoverable(r.startErr) {
   191  			r.reason = ReasonUnrecoverableErrror
   192  			return structs.TaskNotRestarting, 0
   193  		}
   194  	} else if r.exitRes != nil {
   195  		// If the task started successfully and restart on success isn't specified,
   196  		// don't restart but don't mark as failed.
   197  		if r.exitRes.Successful() && !r.onSuccess {
   198  			r.reason = "Restart unnecessary as task terminated successfully"
   199  			return structs.TaskTerminated, 0
   200  		}
   201  	}
   202  
   203  	// If this task has been restarted due to failures more times
   204  	// than the restart policy allows within an interval fail
   205  	// according to the restart policy's mode.
   206  	if r.count > r.policy.Attempts {
   207  		if r.policy.Mode == structs.RestartPolicyModeFail {
   208  			r.reason = fmt.Sprintf(
   209  				`Exceeded allowed attempts %d in interval %v and mode is "fail"`,
   210  				r.policy.Attempts, r.policy.Interval)
   211  			return structs.TaskNotRestarting, 0
   212  		} else {
   213  			r.reason = ReasonDelay
   214  			return structs.TaskRestarting, r.getDelay()
   215  		}
   216  	}
   217  
   218  	r.reason = ReasonWithinPolicy
   219  	return structs.TaskRestarting, r.jitter()
   220  }
   221  
   222  // getDelay returns the delay time to enter the next interval.
   223  func (r *RestartTracker) getDelay() time.Duration {
   224  	end := r.startTime.Add(r.policy.Interval)
   225  	now := time.Now()
   226  	return end.Sub(now)
   227  }
   228  
   229  // jitter returns the delay time plus a jitter.
   230  func (r *RestartTracker) jitter() time.Duration {
   231  	// Get the delay and ensure it is valid.
   232  	d := r.policy.Delay.Nanoseconds()
   233  	if d == 0 {
   234  		d = 1
   235  	}
   236  
   237  	j := float64(r.rand.Int63n(d)) * jitter
   238  	return time.Duration(d + int64(j))
   239  }