github.com/kaisenlinux/docker.io@v0.0.0-20230510090727-ea55db55fac7/swarmkit/manager/orchestrator/restart/restart.go

package restart

import (
	"container/list"
	"context"
	"errors"
	"sync"
	"time"

	"github.com/docker/go-events"
	"github.com/docker/swarmkit/api"
	"github.com/docker/swarmkit/api/defaults"
	"github.com/docker/swarmkit/log"
	"github.com/docker/swarmkit/manager/orchestrator"
	"github.com/docker/swarmkit/manager/state"
	"github.com/docker/swarmkit/manager/state/store"
	gogotypes "github.com/gogo/protobuf/types"
)

const defaultOldTaskTimeout = time.Minute

type restartedInstance struct {
	timestamp time.Time
}

type instanceRestartInfo struct {
	// counter of restarts for this instance.
	totalRestarts uint64
	// Linked list of restartedInstance structs. Only used when
	// Restart.MaxAttempts and Restart.Window are both
	// nonzero.
	restartedInstances *list.List
	// Why is specVersion in this structure and not in the map key? While
	// putting it in the key would be a very simple solution, it wouldn't
	// be easy to clean up map entries corresponding to old specVersions.
	// Making the key version-agnostic and clearing the value whenever the
	// version changes avoids the issue of stale map entries for old
	// versions.
	specVersion api.Version
}
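
// The Supervisor below keeps these records in a two-level map keyed first by
// service ID and then by slot tuple (see historyByService). A lookup could
// look roughly like this sketch (hypothetical IDs; supervisor is a
// *Supervisor):
//
//	tuple := orchestrator.SlotTuple{Slot: 1, ServiceID: "service-id"}
//	info := supervisor.historyByService["service-id"][tuple]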

type delayedStart struct {
	// cancel is called to cancel the delayed start.
	cancel func()
	doneCh chan struct{}

	// waiter is set to true if the next restart is waiting for this delay
	// to complete.
	waiter bool
}

// SupervisorInterface is an interface implemented by the Supervisor. It exists
// to make testing easier, by allowing the restart supervisor to be mocked or
// faked where desired.
type SupervisorInterface interface {
	Restart(context.Context, store.Tx, *api.Cluster, *api.Service, api.Task) error
	UpdatableTasksInSlot(context.Context, orchestrator.Slot, *api.Service) orchestrator.Slot
	RecordRestartHistory(orchestrator.SlotTuple, *api.Task)
	DelayStart(context.Context, store.Tx, *api.Task, string, time.Duration, bool) <-chan struct{}
	StartNow(store.Tx, string) error
	Cancel(string)
	CancelAll()
	ClearServiceHistory(string)
}
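
// A minimal test fake satisfying SupervisorInterface could look like the
// sketch below (hypothetical; the fakes actually used by the orchestrator
// tests live elsewhere and may differ):
//
//	type fakeSupervisor struct{ restarted []string }
//
//	func (f *fakeSupervisor) Restart(_ context.Context, _ store.Tx, _ *api.Cluster, _ *api.Service, t api.Task) error {
//		f.restarted = append(f.restarted, t.ID)
//		return nil
//	}
//	func (f *fakeSupervisor) UpdatableTasksInSlot(_ context.Context, slot orchestrator.Slot, _ *api.Service) orchestrator.Slot {
//		return slot
//	}
//	func (f *fakeSupervisor) RecordRestartHistory(orchestrator.SlotTuple, *api.Task) {}
//	func (f *fakeSupervisor) DelayStart(context.Context, store.Tx, *api.Task, string, time.Duration, bool) <-chan struct{} {
//		ch := make(chan struct{})
//		close(ch)
//		return ch
//	}
//	func (f *fakeSupervisor) StartNow(store.Tx, string) error { return nil }
//	func (f *fakeSupervisor) Cancel(string)                   {}
//	func (f *fakeSupervisor) CancelAll()                      {}
//	func (f *fakeSupervisor) ClearServiceHistory(string)      {}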

// Supervisor initiates and manages restarts. It's responsible for
// delaying restarts when applicable.
type Supervisor struct {
	mu               sync.Mutex
	store            *store.MemoryStore
	delays           map[string]*delayedStart
	historyByService map[string]map[orchestrator.SlotTuple]*instanceRestartInfo
	TaskTimeout      time.Duration
}

// NewSupervisor creates a new Supervisor.
func NewSupervisor(store *store.MemoryStore) *Supervisor {
	return &Supervisor{
		store:            store,
		delays:           make(map[string]*delayedStart),
		historyByService: make(map[string]map[orchestrator.SlotTuple]*instanceRestartInfo),
		TaskTimeout:      defaultOldTaskTimeout,
	}
}
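
// A minimal usage sketch (hypothetical caller code, assuming an existing
// *store.MemoryStore named s owned by the orchestrator):
//
//	restartSupervisor := NewSupervisor(s)
//	defer restartSupervisor.CancelAll()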

func (r *Supervisor) waitRestart(ctx context.Context, oldDelay *delayedStart, cluster *api.Cluster, taskID string) {
	// Wait for the last restart delay to elapse.
	select {
	case <-oldDelay.doneCh:
	case <-ctx.Done():
		return
	}

	// Start the next restart
	err := r.store.Update(func(tx store.Tx) error {
		t := store.GetTask(tx, taskID)
		if t == nil {
			return nil
		}
		if t.DesiredState > api.TaskStateRunning {
			return nil
		}
		service := store.GetService(tx, t.ServiceID)
		if service == nil {
			return nil
		}
		return r.Restart(ctx, tx, cluster, service, *t)
	})

	if err != nil {
		log.G(ctx).WithError(err).Errorf("failed to restart task after waiting for previous restart")
	}
}

// Restart initiates a new task to replace t if appropriate under the service's
// restart policy.
func (r *Supervisor) Restart(ctx context.Context, tx store.Tx, cluster *api.Cluster, service *api.Service, t api.Task) error {
	// TODO(aluzzardi): This function should not depend on `service`.

	// Is the old task still in the process of restarting? If so, wait for
	// its restart delay to elapse, to avoid tight restart loops (for
	// example, when the image doesn't exist).
	r.mu.Lock()
	oldDelay, ok := r.delays[t.ID]
	if ok {
		if !oldDelay.waiter {
			oldDelay.waiter = true
			go r.waitRestart(ctx, oldDelay, cluster, t.ID)
		}
		r.mu.Unlock()
		return nil
	}
	r.mu.Unlock()

	// Sanity check: was the task shut down already by a separate call to
	// Restart? If so, we must avoid restarting it, because this will create
	// an extra task. This should never happen unless there is a bug.
	if t.DesiredState > api.TaskStateCompleted {
		return errors.New("Restart called on task that was already shut down")
	}

	t.DesiredState = api.TaskStateShutdown
	err := store.UpdateTask(tx, &t)
	if err != nil {
		log.G(ctx).WithError(err).Errorf("failed to set task desired state to dead")
		return err
	}

	if !r.shouldRestart(ctx, &t, service) {
		return nil
	}

	var restartTask *api.Task

	if orchestrator.IsReplicatedService(service) || orchestrator.IsReplicatedJob(service) {
		restartTask = orchestrator.NewTask(cluster, service, t.Slot, "")
	} else if orchestrator.IsGlobalService(service) || orchestrator.IsGlobalJob(service) {
		restartTask = orchestrator.NewTask(cluster, service, 0, t.NodeID)
	} else {
		log.G(ctx).Error("service not supported by restart supervisor")
		return nil
	}

	if orchestrator.IsReplicatedJob(service) || orchestrator.IsGlobalJob(service) {
		restartTask.JobIteration = &api.Version{
			Index: service.JobStatus.JobIteration.Index,
		}
	}

	n := store.GetNode(tx, t.NodeID)

	restartTask.DesiredState = api.TaskStateReady

	var restartDelay time.Duration
	// Restart delay is not applied to drained nodes
	if n == nil || n.Spec.Availability != api.NodeAvailabilityDrain {
		if t.Spec.Restart != nil && t.Spec.Restart.Delay != nil {
			var err error
			restartDelay, err = gogotypes.DurationFromProto(t.Spec.Restart.Delay)
			if err != nil {
				log.G(ctx).WithError(err).Error("invalid restart delay; using default")
				restartDelay, _ = gogotypes.DurationFromProto(defaults.Service.Task.Restart.Delay)
			}
		} else {
			restartDelay, _ = gogotypes.DurationFromProto(defaults.Service.Task.Restart.Delay)
		}
	}

	waitStop := true

	// Normally we wait for the old task to stop running, but we skip this
	// if the old task is already dead or the node it's assigned to is down.
	if (n != nil && n.Status.State == api.NodeStatus_DOWN) || t.Status.State > api.TaskStateRunning {
		waitStop = false
	}

	if err := store.CreateTask(tx, restartTask); err != nil {
		log.G(ctx).WithError(err).WithField("task.id", restartTask.ID).Error("task create failed")
		return err
	}

	tuple := orchestrator.SlotTuple{
		Slot:      restartTask.Slot,
		ServiceID: restartTask.ServiceID,
		NodeID:    restartTask.NodeID,
	}
	r.RecordRestartHistory(tuple, restartTask)

	r.DelayStart(ctx, tx, &t, restartTask.ID, restartDelay, waitStop)
	return nil
}
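
// Restart is meant to be called from inside a store.Update transaction, as
// waitRestart does above. A hypothetical caller might look like this (nil
// checks and error handling elided; s is assumed to be the *store.MemoryStore):
//
//	err := s.Update(func(tx store.Tx) error {
//		t := store.GetTask(tx, taskID)
//		service := store.GetService(tx, t.ServiceID)
//		return restartSupervisor.Restart(ctx, tx, cluster, service, *t)
//	})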

// shouldRestart returns true if a task should be restarted according to the
// restart policy.
func (r *Supervisor) shouldRestart(ctx context.Context, t *api.Task, service *api.Service) bool {
	// TODO(aluzzardi): This function should not depend on `service`.
	// There are 3 possible restart policies.
	switch orchestrator.RestartCondition(t) {
	case api.RestartOnAny:
		// we will be restarting; we just need to do a few more checks.
		// however, if the task belongs to a job, then we will treat
		// RestartOnAny the same as RestartOnFailure, as it would be
		// nonsensical to restart completed jobs.
		if orchestrator.IsReplicatedJob(service) || orchestrator.IsGlobalJob(service) {
			// it'd be nice to put a fallthrough here, but we can't fallthrough
			// from inside of an if statement.
			if t.Status.State == api.TaskStateCompleted {
				return false
			}
		}
	case api.RestartOnFailure:
		// we won't restart if the task is in TaskStateCompleted, as this is
		// not a failed state -- it indicates that the task exited with 0
		if t.Status.State == api.TaskStateCompleted {
			return false
		}
	case api.RestartOnNone:
		// RestartOnNone means we just don't restart, ever
		return false
	}

	if t.Spec.Restart == nil || t.Spec.Restart.MaxAttempts == 0 {
		return true
	}

	instanceTuple := orchestrator.SlotTuple{
		Slot:      t.Slot,
		ServiceID: t.ServiceID,
	}

	// Slot is not meaningful for "global" tasks, so they need to be
	// indexed by NodeID.
	if orchestrator.IsGlobalService(service) {
		instanceTuple.NodeID = t.NodeID
	}

	r.mu.Lock()
	defer r.mu.Unlock()

	restartInfo := r.historyByService[t.ServiceID][instanceTuple]
	if restartInfo == nil || (t.SpecVersion != nil && *t.SpecVersion != restartInfo.specVersion) {
		return true
	}

	if t.Spec.Restart.Window == nil || (t.Spec.Restart.Window.Seconds == 0 && t.Spec.Restart.Window.Nanos == 0) {
		return restartInfo.totalRestarts < t.Spec.Restart.MaxAttempts
	}

	if restartInfo.restartedInstances == nil {
		return true
	}

	window, err := gogotypes.DurationFromProto(t.Spec.Restart.Window)
	if err != nil {
		log.G(ctx).WithError(err).Error("invalid restart lookback window")
		return restartInfo.totalRestarts < t.Spec.Restart.MaxAttempts
	}

	var timestamp time.Time
	// Prefer the manager's timestamp over the agent's, since manager
	// clocks are more trustworthy.
	if t.Status.AppliedAt != nil {
		timestamp, err = gogotypes.TimestampFromProto(t.Status.AppliedAt)
		if err != nil {
			log.G(ctx).WithError(err).Error("invalid task status AppliedAt timestamp")
			return restartInfo.totalRestarts < t.Spec.Restart.MaxAttempts
		}
	} else {
		// It's safe to call TimestampFromProto with a nil timestamp
		timestamp, err = gogotypes.TimestampFromProto(t.Status.Timestamp)
		if t.Status.Timestamp == nil || err != nil {
			log.G(ctx).WithError(err).Error("invalid task completion timestamp")
			return restartInfo.totalRestarts < t.Spec.Restart.MaxAttempts
		}
	}
	lookback := timestamp.Add(-window)

	numRestarts := uint64(restartInfo.restartedInstances.Len())

	// Disregard any restarts that happened before the lookback window,
	// and remove them from the linked list since they will no longer
	// be relevant to figuring out if tasks should be restarted going
	// forward.
	var next *list.Element
	for e := restartInfo.restartedInstances.Front(); e != nil; e = next {
		next = e.Next()

		if e.Value.(restartedInstance).timestamp.After(lookback) {
			break
		}
		restartInfo.restartedInstances.Remove(e)
		numRestarts--
	}

	// Ignore restarts that didn't happen before the task we're looking at.
	for e2 := restartInfo.restartedInstances.Back(); e2 != nil; e2 = e2.Prev() {
		if e2.Value.(restartedInstance).timestamp.Before(timestamp) {
			break
		}
		numRestarts--
	}

	if restartInfo.restartedInstances.Len() == 0 {
		restartInfo.restartedInstances = nil
	}

	return numRestarts < t.Spec.Restart.MaxAttempts
}
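
// The policy knobs consulted above live on the task spec's RestartPolicy. A
// hypothetical spec that allows at most 3 restarts within a 2-minute window,
// and only on failure, could be built like this (gogotypes.DurationProto
// converts a time.Duration into the proto duration used by the spec):
//
//	spec.Restart = &api.RestartPolicy{
//		Condition:   api.RestartOnFailure,
//		Delay:       gogotypes.DurationProto(5 * time.Second),
//		MaxAttempts: 3,
//		Window:      gogotypes.DurationProto(2 * time.Minute),
//	}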

// UpdatableTasksInSlot returns the set of tasks that should be passed to the
// updater from this slot, or an empty slice if none should be. An updatable
// slot has either at least one task with desired state <= RUNNING, or its
// most recent task has stopped running and should not be restarted. The latter
// case is for making sure that tasks that shouldn't normally be restarted will
// still be handled by rolling updates when they become outdated. There is a
// special case for rollbacks to make sure that a rollback always takes the
// service to a converged state, instead of ignoring tasks with the original
// spec that stopped running and shouldn't be restarted according to the
// restart policy.
func (r *Supervisor) UpdatableTasksInSlot(ctx context.Context, slot orchestrator.Slot, service *api.Service) orchestrator.Slot {
	if len(slot) < 1 {
		return nil
	}

	var updatable orchestrator.Slot
	for _, t := range slot {
		if t.DesiredState <= api.TaskStateRunning {
			updatable = append(updatable, t)
		}
	}
	if len(updatable) > 0 {
		return updatable
	}

	if service.UpdateStatus != nil && service.UpdateStatus.State == api.UpdateStatus_ROLLBACK_STARTED {
		return nil
	}

	// Find most recent task
	byTimestamp := orchestrator.TasksByTimestamp(slot)
	newestIndex := 0
	for i := 1; i != len(slot); i++ {
		if byTimestamp.Less(newestIndex, i) {
			newestIndex = i
		}
	}

	if !r.shouldRestart(ctx, slot[newestIndex], service) {
		return orchestrator.Slot{slot[newestIndex]}
	}
	return nil
}

// RecordRestartHistory updates the historyByService map to reflect the restart
// of replacementTask.
func (r *Supervisor) RecordRestartHistory(tuple orchestrator.SlotTuple, replacementTask *api.Task) {
	if replacementTask.Spec.Restart == nil || replacementTask.Spec.Restart.MaxAttempts == 0 {
		// No limit on the number of restarts, so no need to record
		// history.
		return
	}

	r.mu.Lock()
	defer r.mu.Unlock()

	serviceID := replacementTask.ServiceID
	if r.historyByService[serviceID] == nil {
		r.historyByService[serviceID] = make(map[orchestrator.SlotTuple]*instanceRestartInfo)
	}
	if r.historyByService[serviceID][tuple] == nil {
		r.historyByService[serviceID][tuple] = &instanceRestartInfo{}
	}

	restartInfo := r.historyByService[serviceID][tuple]

	if replacementTask.SpecVersion != nil && *replacementTask.SpecVersion != restartInfo.specVersion {
		// This task has a different SpecVersion from the one we're
		// tracking. Most likely, the service was updated. Past failures
		// shouldn't count against the new service definition, so clear
		// the history for this instance.
		*restartInfo = instanceRestartInfo{
			specVersion: *replacementTask.SpecVersion,
		}
	}

	restartInfo.totalRestarts++

	if replacementTask.Spec.Restart.Window != nil && (replacementTask.Spec.Restart.Window.Seconds != 0 || replacementTask.Spec.Restart.Window.Nanos != 0) {
		if restartInfo.restartedInstances == nil {
			restartInfo.restartedInstances = list.New()
		}

		// it's okay to call TimestampFromProto with a nil argument
		timestamp, err := gogotypes.TimestampFromProto(replacementTask.Meta.CreatedAt)
		if replacementTask.Meta.CreatedAt == nil || err != nil {
			timestamp = time.Now()
		}

		restartedInstance := restartedInstance{
			timestamp: timestamp,
		}

		restartInfo.restartedInstances.PushBack(restartedInstance)
	}
}

// DelayStart starts a timer that moves the task from READY to RUNNING once:
// - The restart delay has elapsed (if applicable)
// - The old task that it's replacing has stopped running (or this times out)
// It must be called during an Update transaction to ensure that it does not
// miss events. The purpose of the store.Tx argument is to avoid accidental
// calls outside an Update transaction.
func (r *Supervisor) DelayStart(ctx context.Context, _ store.Tx, oldTask *api.Task, newTaskID string, delay time.Duration, waitStop bool) <-chan struct{} {
	ctx, cancel := context.WithCancel(context.Background())
	doneCh := make(chan struct{})

	r.mu.Lock()
	for {
		oldDelay, ok := r.delays[newTaskID]
		if !ok {
			break
		}
		oldDelay.cancel()
		r.mu.Unlock()
		// Note that this channel read should only block for a very
		// short time, because we cancelled the existing delay and
		// that should cause it to stop immediately.
		<-oldDelay.doneCh
		r.mu.Lock()
	}
	r.delays[newTaskID] = &delayedStart{cancel: cancel, doneCh: doneCh}
	r.mu.Unlock()

	var watch chan events.Event
	cancelWatch := func() {}

	waitForTask := waitStop && oldTask != nil && oldTask.Status.State <= api.TaskStateRunning

	if waitForTask {
		// Wait for either the old task to complete, or the old task's
		// node to become unavailable.
		watch, cancelWatch = state.Watch(
			r.store.WatchQueue(),
			api.EventUpdateTask{
				Task:   &api.Task{ID: oldTask.ID, Status: api.TaskStatus{State: api.TaskStateRunning}},
				Checks: []api.TaskCheckFunc{api.TaskCheckID, state.TaskCheckStateGreaterThan},
			},
			api.EventUpdateNode{
				Node:   &api.Node{ID: oldTask.NodeID, Status: api.NodeStatus{State: api.NodeStatus_DOWN}},
				Checks: []api.NodeCheckFunc{api.NodeCheckID, state.NodeCheckState},
			},
			api.EventDeleteNode{
				Node:   &api.Node{ID: oldTask.NodeID},
				Checks: []api.NodeCheckFunc{api.NodeCheckID},
			},
		)
	}

	go func() {
		defer func() {
			cancelWatch()
			r.mu.Lock()
			delete(r.delays, newTaskID)
			r.mu.Unlock()
			close(doneCh)
		}()

		oldTaskTimer := time.NewTimer(r.TaskTimeout)
		defer oldTaskTimer.Stop()

		// Wait for the delay to elapse, if one is specified.
		if delay != 0 {
			select {
			case <-time.After(delay):
			case <-ctx.Done():
				return
			}
		}

		if waitForTask {
			select {
			case <-watch:
			case <-oldTaskTimer.C:
			case <-ctx.Done():
				return
			}
		}

		err := r.store.Update(func(tx store.Tx) error {
			err := r.StartNow(tx, newTaskID)
			if err != nil {
				log.G(ctx).WithError(err).WithField("task.id", newTaskID).Error("moving task out of delayed state failed")
			}
			return nil
		})
		if err != nil {
			log.G(ctx).WithError(err).WithField("task.id", newTaskID).Error("task restart transaction failed")
		}
	}()

	return doneCh
}
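
// A hypothetical caller that needs to block until the delayed task has been
// started (or the delay cancelled) could hold onto the returned channel
// (error handling elided; s is assumed to be the *store.MemoryStore):
//
//	var doneCh <-chan struct{}
//	s.Update(func(tx store.Tx) error {
//		doneCh = restartSupervisor.DelayStart(ctx, tx, nil, newTaskID, delay, false)
//		return nil
//	})
//	<-doneCh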

// StartNow moves the task into the RUNNING state (or COMPLETED, in the case
// of job tasks) so it will proceed to start up.
func (r *Supervisor) StartNow(tx store.Tx, taskID string) error {
	t := store.GetTask(tx, taskID)
	if t == nil || t.DesiredState >= api.TaskStateRunning {
		return nil
	}

	// only tasks belonging to jobs will have a JobIteration, so this can be
	// used to distinguish whether this is a job task without looking at the
	// service.
	if t.JobIteration != nil {
		t.DesiredState = api.TaskStateCompleted
	} else {
		t.DesiredState = api.TaskStateRunning
	}
	return store.UpdateTask(tx, t)
}

// Cancel cancels a pending restart.
func (r *Supervisor) Cancel(taskID string) {
	r.mu.Lock()
	delay, ok := r.delays[taskID]
	r.mu.Unlock()

	if !ok {
		return
	}

	delay.cancel()
	<-delay.doneCh
}

// CancelAll aborts all pending restarts.
func (r *Supervisor) CancelAll() {
	r.mu.Lock()
	for _, delay := range r.delays {
		delay.cancel()
	}
	r.mu.Unlock()
}

// ClearServiceHistory forgets restart history related to a given service ID.
func (r *Supervisor) ClearServiceHistory(serviceID string) {
	r.mu.Lock()
	delete(r.historyByService, serviceID)
	r.mu.Unlock()
}
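
// A hypothetical orchestrator would typically call ClearServiceHistory when it
// observes that a service has been deleted, so stale restart history does not
// accumulate. A sketch, assuming event comes from the store's watch queue:
//
//	switch v := event.(type) {
//	case api.EventDeleteService:
//		restartSupervisor.ClearServiceHistory(v.Service.ID)
//	}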