github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/client/allocrunner/taskrunner/restarts/restarts.go (about)

     1  package restarts
     2  
     3  import (
     4  	"fmt"
     5  	"math/rand"
     6  	"sync"
     7  	"time"
     8  
     9  	"github.com/hashicorp/nomad/nomad/structs"
    10  	"github.com/hashicorp/nomad/plugins/drivers"
    11  )
    12  
    13  const (
    14  	// jitter is the percent of jitter added to restart delays.
    15  	jitter = 0.25
    16  
    17  	ReasonNoRestartsAllowed  = "Policy allows no restarts"
    18  	ReasonUnrecoverableError = "Error was unrecoverable"
    19  	ReasonWithinPolicy       = "Restart within policy"
    20  	ReasonDelay              = "Exceeded allowed attempts, applying a delay"
    21  )
    22  
    23  func NewRestartTracker(policy *structs.RestartPolicy, jobType string, tlc *structs.TaskLifecycleConfig) *RestartTracker {
    24  	onSuccess := true
    25  
    26  	// Batch & SysBatch jobs should not restart if they exit successfully
    27  	if jobType == structs.JobTypeBatch || jobType == structs.JobTypeSysBatch {
    28  		onSuccess = false
    29  	}
    30  
    31  	// Prestart sidecars should get restarted on success
    32  	if tlc != nil && tlc.Hook == structs.TaskLifecycleHookPrestart {
    33  		onSuccess = tlc.Sidecar
    34  	}
    35  
    36  	// Poststart sidecars should get restarted on success
    37  	if tlc != nil && tlc.Hook == structs.TaskLifecycleHookPoststart {
    38  		onSuccess = tlc.Sidecar
    39  	}
    40  
    41  	// Poststop should never be restarted on success
    42  	if tlc != nil && tlc.Hook == structs.TaskLifecycleHookPoststop {
    43  		onSuccess = false
    44  	}
    45  
    46  	return &RestartTracker{
    47  		startTime: time.Now(),
    48  		onSuccess: onSuccess,
    49  		policy:    policy,
    50  		rand:      rand.New(rand.NewSource(time.Now().Unix())),
    51  	}
    52  }
    53  
// RestartTracker records what happened to a task since the last decision
// (start error, exit result, kill or restart signal) and, through GetState,
// turns that into the task's next state under the configured restart policy.
// Its exported methods acquire lock before reading or writing any field.
type RestartTracker struct {
	exitRes          *drivers.ExitResult    // Most recent exit result; cleared by GetState
	startErr         error                  // Most recent start error; cleared by GetState
	killed           bool                   // Whether the task has been killed
	restartTriggered bool                   // Whether the task has been signalled to be restarted
	failure          bool                   // Whether a failure triggered the restart
	count            int                    // Current number of attempts.
	onSuccess        bool                   // Whether to restart on successful exit code.
	startTime        time.Time              // When the interval began
	reason           string                 // The reason for the last state
	policy           *structs.RestartPolicy // Restart policy consulted by GetState
	rand             *rand.Rand             // Randomness source used for restart delay jitter
	lock             sync.Mutex             // Held by the exported methods while they access the fields above
}
    68  
    69  // SetPolicy updates the policy used to determine restarts.
    70  func (r *RestartTracker) SetPolicy(policy *structs.RestartPolicy) {
    71  	r.lock.Lock()
    72  	defer r.lock.Unlock()
    73  	r.policy = policy
    74  }
    75  
    76  // GetPolicy returns a copy of the policy used to determine restarts.
    77  func (r *RestartTracker) GetPolicy() *structs.RestartPolicy {
    78  	r.lock.Lock()
    79  	defer r.lock.Unlock()
    80  	return r.policy.Copy()
    81  }
    82  
    83  // SetStartError is used to mark the most recent start error. If starting was
    84  // successful the error should be nil.
    85  func (r *RestartTracker) SetStartError(err error) *RestartTracker {
    86  	r.lock.Lock()
    87  	defer r.lock.Unlock()
    88  	r.startErr = err
    89  	r.failure = true
    90  	return r
    91  }
    92  
    93  // SetExitResult is used to mark the most recent wait result.
    94  func (r *RestartTracker) SetExitResult(res *drivers.ExitResult) *RestartTracker {
    95  	r.lock.Lock()
    96  	defer r.lock.Unlock()
    97  	r.exitRes = res
    98  	r.failure = true
    99  	return r
   100  }
   101  
   102  // SetRestartTriggered is used to mark that the task has been signalled to be
   103  // restarted. Setting the failure to true restarts according to the restart
   104  // policy. When failure is false the task is restarted without considering the
   105  // restart policy.
   106  func (r *RestartTracker) SetRestartTriggered(failure bool) *RestartTracker {
   107  	r.lock.Lock()
   108  	defer r.lock.Unlock()
   109  	if failure {
   110  		r.failure = true
   111  	} else {
   112  		r.restartTriggered = true
   113  	}
   114  	return r
   115  }
   116  
   117  // SetKilled is used to mark that the task has been killed.
   118  func (r *RestartTracker) SetKilled() *RestartTracker {
   119  	r.lock.Lock()
   120  	defer r.lock.Unlock()
   121  	r.killed = true
   122  	return r
   123  }
   124  
   125  // GetReason returns a human-readable description for the last state returned by
   126  // GetState.
   127  func (r *RestartTracker) GetReason() string {
   128  	r.lock.Lock()
   129  	defer r.lock.Unlock()
   130  	return r.reason
   131  }
   132  
   133  // GetCount returns the current restart count
   134  func (r *RestartTracker) GetCount() int {
   135  	r.lock.Lock()
   136  	defer r.lock.Unlock()
   137  	return r.count
   138  }
   139  
   140  // GetState returns the tasks next state given the set exit code and start
   141  // error. One of the following states are returned:
   142  //   - TaskRestarting - Task should be restarted
   143  //   - TaskNotRestarting - Task should not be restarted and has exceeded its
   144  //     restart policy.
   145  //   - TaskTerminated - Task has terminated successfully and does not need a
   146  //     restart.
   147  //
   148  // If TaskRestarting is returned, the duration is how long to wait until
   149  // starting the task again.
   150  func (r *RestartTracker) GetState() (string, time.Duration) {
   151  	r.lock.Lock()
   152  	defer r.lock.Unlock()
   153  
   154  	// Clear out the existing state
   155  	defer func() {
   156  		r.startErr = nil
   157  		r.exitRes = nil
   158  		r.restartTriggered = false
   159  		r.failure = false
   160  		r.killed = false
   161  	}()
   162  
   163  	// Hot path if task was killed
   164  	if r.killed {
   165  		r.reason = ""
   166  		return structs.TaskKilled, 0
   167  	}
   168  
   169  	// Hot path if a restart was triggered
   170  	if r.restartTriggered {
   171  		r.reason = ""
   172  		return structs.TaskRestarting, 0
   173  	}
   174  
   175  	// Hot path if no attempts are expected
   176  	if r.policy.Attempts == 0 {
   177  		r.reason = ReasonNoRestartsAllowed
   178  
   179  		// If the task does not restart on a successful exit code and
   180  		// the exit code was successful: terminate.
   181  		if !r.onSuccess && r.exitRes != nil && r.exitRes.Successful() {
   182  			return structs.TaskTerminated, 0
   183  		}
   184  
   185  		// Task restarts even on a successful exit code but no restarts
   186  		// allowed.
   187  		return structs.TaskNotRestarting, 0
   188  	}
   189  
   190  	// Check if we have entered a new interval.
   191  	end := r.startTime.Add(r.policy.Interval)
   192  	now := time.Now()
   193  	if now.After(end) {
   194  		r.count = 0
   195  		r.startTime = now
   196  	}
   197  
   198  	r.count++
   199  
   200  	// Handle restarts due to failures
   201  	if !r.failure {
   202  		return "", 0
   203  	}
   204  
   205  	if r.startErr != nil {
   206  		// If the error is not recoverable, do not restart.
   207  		if !structs.IsRecoverable(r.startErr) {
   208  			r.reason = ReasonUnrecoverableError
   209  			return structs.TaskNotRestarting, 0
   210  		}
   211  	} else if r.exitRes != nil {
   212  		// If the task started successfully and restart on success isn't specified,
   213  		// don't restart but don't mark as failed.
   214  		if r.exitRes.Successful() && !r.onSuccess {
   215  			r.reason = "Restart unnecessary as task terminated successfully"
   216  			return structs.TaskTerminated, 0
   217  		}
   218  	}
   219  
   220  	// If this task has been restarted due to failures more times
   221  	// than the restart policy allows within an interval fail
   222  	// according to the restart policy's mode.
   223  	if r.count > r.policy.Attempts {
   224  		if r.policy.Mode == structs.RestartPolicyModeFail {
   225  			r.reason = fmt.Sprintf(
   226  				`Exceeded allowed attempts %d in interval %v and mode is "fail"`,
   227  				r.policy.Attempts, r.policy.Interval)
   228  			return structs.TaskNotRestarting, 0
   229  		} else {
   230  			r.reason = ReasonDelay
   231  			return structs.TaskRestarting, r.getDelay()
   232  		}
   233  	}
   234  
   235  	r.reason = ReasonWithinPolicy
   236  	return structs.TaskRestarting, r.jitter()
   237  }
   238  
   239  // getDelay returns the delay time to enter the next interval.
   240  func (r *RestartTracker) getDelay() time.Duration {
   241  	end := r.startTime.Add(r.policy.Interval)
   242  	now := time.Now()
   243  	return end.Sub(now)
   244  }
   245  
   246  // jitter returns the delay time plus a jitter.
   247  func (r *RestartTracker) jitter() time.Duration {
   248  	// Get the delay and ensure it is valid.
   249  	d := r.policy.Delay.Nanoseconds()
   250  	if d == 0 {
   251  		d = 1
   252  	}
   253  
   254  	j := float64(r.rand.Int63n(d)) * jitter
   255  	return time.Duration(d + int64(j))
   256  }