github.com/Ilhicas/nomad@v1.0.4-0.20210304152020-e86851182bc3/client/allocrunner/taskrunner/restarts/restarts.go

github.com/Ilhicas/nomad@v1.0.4-0.20210304152020-e86851182bc3/client/allocrunner/taskrunner/restarts/restarts.go (about)

     1  package restarts
     2  
     3  import (
     4  	"fmt"
     5  	"math/rand"
     6  	"sync"
     7  	"time"
     8  
     9  	"github.com/hashicorp/nomad/nomad/structs"
    10  	"github.com/hashicorp/nomad/plugins/drivers"
    11  )
    12  
const (
	// jitter is the fraction (0.25, i.e. 25%) applied to the randomly drawn
	// portion of the restart delay that (*RestartTracker).jitter adds on top
	// of the configured delay.
	jitter = 0.25

	// ReasonNoRestartsAllowed is the reason set by GetState when the policy's
	// Attempts is zero.
	ReasonNoRestartsAllowed   = "Policy allows no restarts"
	// ReasonUnrecoverableErrror is the reason set by GetState when the recorded
	// start error is not recoverable. The identifier's spelling is part of the
	// exported API and is kept as is.
	ReasonUnrecoverableErrror = "Error was unrecoverable"
	// ReasonWithinPolicy is the reason set by GetState when the restart count
	// has not exceeded the policy's Attempts.
	ReasonWithinPolicy        = "Restart within policy"
	// ReasonDelay is the reason set by GetState when the restart count exceeds
	// the policy's Attempts and the policy mode is not "fail".
	ReasonDelay               = "Exceeded allowed attempts, applying a delay"
)
    22  
    23  func NewRestartTracker(policy *structs.RestartPolicy, jobType string, tlc *structs.TaskLifecycleConfig) *RestartTracker {
    24  	// Batch jobs should not restart if they exit successfully
    25  	onSuccess := jobType != structs.JobTypeBatch
    26  
    27  	// Prestart sidecars should get restarted on success
    28  	if tlc != nil && tlc.Hook == structs.TaskLifecycleHookPrestart {
    29  		onSuccess = tlc.Sidecar
    30  	}
    31  
    32  	// Poststart sidecars should get restarted on success
    33  	if tlc != nil && tlc.Hook == structs.TaskLifecycleHookPoststart {
    34  		onSuccess = tlc.Sidecar
    35  	}
    36  
    37  	// Poststop should never be restarted on success
    38  	if tlc != nil && tlc.Hook == structs.TaskLifecycleHookPoststop {
    39  		onSuccess = false
    40  	}
    41  
    42  	return &RestartTracker{
    43  		startTime: time.Now(),
    44  		onSuccess: onSuccess,
    45  		policy:    policy,
    46  		rand:      rand.New(rand.NewSource(time.Now().Unix())),
    47  	}
    48  }
    49  
// RestartTracker decides, on each GetState call, whether a task should be
// restarted. The decision is based on the restart policy and on the events
// recorded through the Set* methods since the previous GetState call. The
// exported methods acquire lock before touching any field.
type RestartTracker struct {
	// exitRes is the most recent exit result recorded by SetExitResult; it is
	// cleared when GetState returns.
	exitRes          *drivers.ExitResult
	// startErr is the most recent start error recorded by SetStartError; it is
	// cleared when GetState returns.
	startErr         error
	killed           bool      // Whether the task has been killed
	restartTriggered bool      // Whether the task has been signalled to be restarted
	failure          bool      // Whether a failure triggered the restart
	count            int       // Current number of attempts.
	onSuccess        bool      // Whether to restart on successful exit code.
	startTime        time.Time // When the interval began
	reason           string    // The reason for the last state
	// policy is the restart policy consulted by GetState; replaced by SetPolicy.
	policy           *structs.RestartPolicy
	// rand is the random source used to compute the restart jitter.
	rand             *rand.Rand
	// lock guards all of the fields above.
	lock             sync.Mutex
}
    64  
    65  // SetPolicy updates the policy used to determine restarts.
    66  func (r *RestartTracker) SetPolicy(policy *structs.RestartPolicy) {
    67  	r.lock.Lock()
    68  	defer r.lock.Unlock()
    69  	r.policy = policy
    70  }
    71  
    72  // GetPolicy returns a copy of the policy used to determine restarts.
    73  func (r *RestartTracker) GetPolicy() *structs.RestartPolicy {
    74  	r.lock.Lock()
    75  	defer r.lock.Unlock()
    76  	return r.policy.Copy()
    77  }
    78  
    79  // SetStartError is used to mark the most recent start error. If starting was
    80  // successful the error should be nil.
    81  func (r *RestartTracker) SetStartError(err error) *RestartTracker {
    82  	r.lock.Lock()
    83  	defer r.lock.Unlock()
    84  	r.startErr = err
    85  	r.failure = true
    86  	return r
    87  }
    88  
    89  // SetExitResult is used to mark the most recent wait result.
    90  func (r *RestartTracker) SetExitResult(res *drivers.ExitResult) *RestartTracker {
    91  	r.lock.Lock()
    92  	defer r.lock.Unlock()
    93  	r.exitRes = res
    94  	r.failure = true
    95  	return r
    96  }
    97  
    98  // SetRestartTriggered is used to mark that the task has been signalled to be
    99  // restarted. Setting the failure to true restarts according to the restart
   100  // policy. When failure is false the task is restarted without considering the
   101  // restart policy.
   102  func (r *RestartTracker) SetRestartTriggered(failure bool) *RestartTracker {
   103  	r.lock.Lock()
   104  	defer r.lock.Unlock()
   105  	if failure {
   106  		r.failure = true
   107  	} else {
   108  		r.restartTriggered = true
   109  	}
   110  	return r
   111  }
   112  
   113  // SetKilled is used to mark that the task has been killed.
   114  func (r *RestartTracker) SetKilled() *RestartTracker {
   115  	r.lock.Lock()
   116  	defer r.lock.Unlock()
   117  	r.killed = true
   118  	return r
   119  }
   120  
   121  // GetReason returns a human-readable description for the last state returned by
   122  // GetState.
   123  func (r *RestartTracker) GetReason() string {
   124  	r.lock.Lock()
   125  	defer r.lock.Unlock()
   126  	return r.reason
   127  }
   128  
   129  // GetCount returns the current restart count
   130  func (r *RestartTracker) GetCount() int {
   131  	r.lock.Lock()
   132  	defer r.lock.Unlock()
   133  	return r.count
   134  }
   135  
   136  // GetState returns the tasks next state given the set exit code and start
   137  // error. One of the following states are returned:
   138  // * TaskRestarting - Task should be restarted
   139  // * TaskNotRestarting - Task should not be restarted and has exceeded its
   140  //   restart policy.
   141  // * TaskTerminated - Task has terminated successfully and does not need a
   142  //   restart.
   143  //
   144  // If TaskRestarting is returned, the duration is how long to wait until
   145  // starting the task again.
   146  func (r *RestartTracker) GetState() (string, time.Duration) {
   147  	r.lock.Lock()
   148  	defer r.lock.Unlock()
   149  
   150  	// Clear out the existing state
   151  	defer func() {
   152  		r.startErr = nil
   153  		r.exitRes = nil
   154  		r.restartTriggered = false
   155  		r.failure = false
   156  		r.killed = false
   157  	}()
   158  
   159  	// Hot path if task was killed
   160  	if r.killed {
   161  		r.reason = ""
   162  		return structs.TaskKilled, 0
   163  	}
   164  
   165  	// Hot path if a restart was triggered
   166  	if r.restartTriggered {
   167  		r.reason = ""
   168  		return structs.TaskRestarting, 0
   169  	}
   170  
   171  	// Hot path if no attempts are expected
   172  	if r.policy.Attempts == 0 {
   173  		r.reason = ReasonNoRestartsAllowed
   174  
   175  		// If the task does not restart on a successful exit code and
   176  		// the exit code was successful: terminate.
   177  		if !r.onSuccess && r.exitRes != nil && r.exitRes.Successful() {
   178  			return structs.TaskTerminated, 0
   179  		}
   180  
   181  		// Task restarts even on a successful exit code but no restarts
   182  		// allowed.
   183  		return structs.TaskNotRestarting, 0
   184  	}
   185  
   186  	// Check if we have entered a new interval.
   187  	end := r.startTime.Add(r.policy.Interval)
   188  	now := time.Now()
   189  	if now.After(end) {
   190  		r.count = 0
   191  		r.startTime = now
   192  	}
   193  
   194  	r.count++
   195  
   196  	// Handle restarts due to failures
   197  	if !r.failure {
   198  		return "", 0
   199  	}
   200  
   201  	if r.startErr != nil {
   202  		// If the error is not recoverable, do not restart.
   203  		if !structs.IsRecoverable(r.startErr) {
   204  			r.reason = ReasonUnrecoverableErrror
   205  			return structs.TaskNotRestarting, 0
   206  		}
   207  	} else if r.exitRes != nil {
   208  		// If the task started successfully and restart on success isn't specified,
   209  		// don't restart but don't mark as failed.
   210  		if r.exitRes.Successful() && !r.onSuccess {
   211  			r.reason = "Restart unnecessary as task terminated successfully"
   212  			return structs.TaskTerminated, 0
   213  		}
   214  	}
   215  
   216  	// If this task has been restarted due to failures more times
   217  	// than the restart policy allows within an interval fail
   218  	// according to the restart policy's mode.
   219  	if r.count > r.policy.Attempts {
   220  		if r.policy.Mode == structs.RestartPolicyModeFail {
   221  			r.reason = fmt.Sprintf(
   222  				`Exceeded allowed attempts %d in interval %v and mode is "fail"`,
   223  				r.policy.Attempts, r.policy.Interval)
   224  			return structs.TaskNotRestarting, 0
   225  		} else {
   226  			r.reason = ReasonDelay
   227  			return structs.TaskRestarting, r.getDelay()
   228  		}
   229  	}
   230  
   231  	r.reason = ReasonWithinPolicy
   232  	return structs.TaskRestarting, r.jitter()
   233  }
   234  
   235  // getDelay returns the delay time to enter the next interval.
   236  func (r *RestartTracker) getDelay() time.Duration {
   237  	end := r.startTime.Add(r.policy.Interval)
   238  	now := time.Now()
   239  	return end.Sub(now)
   240  }
   241  
   242  // jitter returns the delay time plus a jitter.
   243  func (r *RestartTracker) jitter() time.Duration {
   244  	// Get the delay and ensure it is valid.
   245  	d := r.policy.Delay.Nanoseconds()
   246  	if d == 0 {
   247  		d = 1
   248  	}
   249  
   250  	j := float64(r.rand.Int63n(d)) * jitter
   251  	return time.Duration(d + int64(j))
   252  }