github.com/kaisenlinux/docker.io@v0.0.0-20230510090727-ea55db55fac7/swarmkit/manager/orchestrator/taskreaper/task_reaper.go

package taskreaper

import (
	"context"
	"sort"
	"sync"
	"time"

	"github.com/docker/swarmkit/api"
	"github.com/docker/swarmkit/log"
	"github.com/docker/swarmkit/manager/orchestrator"
	"github.com/docker/swarmkit/manager/state"
	"github.com/docker/swarmkit/manager/state/store"
)

const (
	// maxDirty is the size threshold for running a task pruning operation.
	maxDirty = 1000
	// reaperBatchingInterval is how long the reaper waits, after an event,
	// before running a batched pruning pass.
	reaperBatchingInterval = 250 * time.Millisecond
)
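
// Taken together, these constants batch the reaping work: a cleanup pass runs
// as soon as more than maxDirty slots/tasks are pending, or roughly
// reaperBatchingInterval after the first event that follows a quiet period,
// whichever comes first.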

// A TaskReaper deletes old tasks when more than TaskHistoryRetentionLimit tasks
// exist for the same service/instance or service/nodeid combination.
type TaskReaper struct {
	store *store.MemoryStore

	// closeOnce ensures that stopChan is closed only once
	closeOnce sync.Once

	// taskHistory is the number of tasks to keep
	taskHistory int64

	// List of slot tuples to be inspected for task history cleanup.
	dirty map[orchestrator.SlotTuple]struct{}

	// List of tasks collected for cleanup, which includes two kinds of tasks
	// - serviceless orphaned tasks
	// - tasks with desired state REMOVE that have already been shut down
	cleanup  []string
	stopChan chan struct{}
	doneChan chan struct{}

	// tickSignal is a channel that, if non-nil and available, will be written
	// to in order to signal that a tick has occurred. Its sole purpose is for
	// testing code, to verify that task cleanup attempts are happening when
	// they should be.
	tickSignal chan struct{}
}
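
// In tests, tickSignal can be set before Run is started in order to observe
// cleanup passes. A minimal sketch (assuming a test-owned MemoryStore in s and
// a test context in ctx; this is not part of the production API surface):
//
//	tr := New(s)
//	tr.tickSignal = make(chan struct{}, 1)
//	go tr.Run(ctx)
//	<-tr.tickSignal // a cleanup pass has been attempted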

// New creates a new TaskReaper.
func New(store *store.MemoryStore) *TaskReaper {
	return &TaskReaper{
		store:    store,
		dirty:    make(map[orchestrator.SlotTuple]struct{}),
		stopChan: make(chan struct{}),
		doneChan: make(chan struct{}),
	}
}
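
// A typical lifecycle, as wired up by the swarmkit manager (illustrative
// sketch; memoryStore and ctx are assumed to be provided by the caller, and
// the manager plumbing itself lives outside this file):
//
//	tr := taskreaper.New(memoryStore)
//	go tr.Run(ctx)
//	// ... later, on shutdown or on losing leadership ...
//	tr.Stop()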

// Run is the TaskReaper's watch loop which collects candidates for cleanup.
// Task history is mainly used in task restarts but is also available for administrative purposes.
// Note that the task history is stored per-slot-per-service for replicated services
// and per-node-per-service for global services. History does not apply to serviceless tasks
// since they are not attached to a service. In addition, the TaskReaper watch loop is also
// responsible for cleaning up tasks associated with slots that were removed as part of
// service scale down or service removal.
func (tr *TaskReaper) Run(ctx context.Context) {
	watcher, watchCancel := state.Watch(tr.store.WatchQueue(), api.EventCreateTask{}, api.EventUpdateTask{}, api.EventUpdateCluster{})

	defer func() {
		close(tr.doneChan)
		watchCancel()
	}()

	var orphanedTasks []*api.Task
	var removeTasks []*api.Task
	tr.store.View(func(readTx store.ReadTx) {
		var err error

		clusters, err := store.FindClusters(readTx, store.ByName(store.DefaultClusterName))
		if err == nil && len(clusters) == 1 {
			tr.taskHistory = clusters[0].Spec.Orchestration.TaskHistoryRetentionLimit
		}

		// On startup, scan the entire store and inspect orphaned tasks left
		// over from a previous life of the manager.
		orphanedTasks, err = store.FindTasks(readTx, store.ByTaskState(api.TaskStateOrphaned))
		if err != nil {
			log.G(ctx).WithError(err).Error("failed to find Orphaned tasks in task reaper init")
		}
		removeTasks, err = store.FindTasks(readTx, store.ByDesiredState(api.TaskStateRemove))
		if err != nil {
			log.G(ctx).WithError(err).Error("failed to find tasks with desired state REMOVE in task reaper init")
		}
	})

	if len(orphanedTasks)+len(removeTasks) > 0 {
		for _, t := range orphanedTasks {
			// Do not reap service tasks immediately.
			// Let them go through the regular history cleanup process
			// of checking TaskHistoryRetentionLimit.
			if t.ServiceID != "" {
				continue
			}

			// Serviceless tasks can be cleaned up right away since they are not attached to a service.
			tr.cleanup = append(tr.cleanup, t.ID)
		}
		// tasks with desired state REMOVE that have reached COMPLETE or beyond,
		// or that haven't been assigned yet, can be cleaned up right away
		for _, t := range removeTasks {
			if t.Status.State < api.TaskStateAssigned || t.Status.State >= api.TaskStateCompleted {
				tr.cleanup = append(tr.cleanup, t.ID)
			}
		}
		// Clean up tasks in 'cleanup' right away
		if len(tr.cleanup) > 0 {
			tr.tick()
		}
	}

	// Clean up when the dirty set and cleanup list together exceed maxDirty,
	// or when the timer expires, whichever happens first.
	//
	// Specifically, the way this should work:
	// - Create a timer and immediately stop it. We don't want to fire the
	//   cleanup routine yet, because we just did a cleanup as part of the
	//   initialization above.
	// - Launch into an event loop
	// - When we receive an event, handle the event as needed
	// - After receiving the event:
	//   - If the batching threshold (maxDirty) is exceeded by the combined
	//     size of dirty + cleanup, then immediately launch into the cleanup
	//     routine
	//   - Otherwise, if the timer is stopped, start it (reset).
	// - If the timer expires and the timer channel is signaled, then Stop the
	//   timer (so that it will be ready to be started again as needed), and
	//   execute the cleanup routine (tick)
	timer := time.NewTimer(reaperBatchingInterval)
	timer.Stop()

	// If stop is somehow called AFTER the timer has expired, there will be a
	// value in the timer.C channel. If there is such a value, we should drain
	// it out. This select statement allows us to drain that value if it's
	// present, or continue straight through otherwise.
	select {
	case <-timer.C:
	default:
	}

	// keep track with a boolean of whether the timer is currently stopped
	isTimerStopped := true

	// Watch for:
	// 1. EventCreateTask for cleaning slots, which is the best time to clean up
	//    that node/slot.
	// 2. EventUpdateTask for cleaning
	//    - serviceless orphaned tasks (when the orchestrator updates the task status to ORPHANED)
	//    - tasks which have desired state REMOVE and have been shut down by the agent
	//      (these are tasks which are associated with slots removed as part of service
	//       remove or scale down)
	// 3. EventUpdateCluster for TaskHistoryRetentionLimit update.
	for {
		select {
		case event := <-watcher:
			switch v := event.(type) {
			case api.EventCreateTask:
				t := v.Task
				tr.dirty[orchestrator.SlotTuple{
					Slot:      t.Slot,
					ServiceID: t.ServiceID,
					NodeID:    t.NodeID,
				}] = struct{}{}
			case api.EventUpdateTask:
				t := v.Task
				// add serviceless orphaned tasks
				if t.Status.State >= api.TaskStateOrphaned && t.ServiceID == "" {
					tr.cleanup = append(tr.cleanup, t.ID)
				}
				// add tasks with desired state REMOVE that are not yet assigned
				// or have reached COMPLETE or beyond. These tasks are associated
				// with slots that were removed as part of a service scale down
				// or service removal.
				if t.DesiredState == api.TaskStateRemove && (t.Status.State < api.TaskStateAssigned || t.Status.State >= api.TaskStateCompleted) {
					tr.cleanup = append(tr.cleanup, t.ID)
				}
			case api.EventUpdateCluster:
				tr.taskHistory = v.Cluster.Spec.Orchestration.TaskHistoryRetentionLimit
			}

			if len(tr.dirty)+len(tr.cleanup) > maxDirty {
				// stop the timer, so we don't fire it. if we get another event
				// after we do this cleaning, we will reset the timer then
				timer.Stop()
				// if the timer had fired, drain out the value.
				select {
				case <-timer.C:
				default:
				}
				isTimerStopped = true
				tr.tick()
			} else if isTimerStopped {
				timer.Reset(reaperBatchingInterval)
				isTimerStopped = false
			}
		case <-timer.C:
			// there is no need to drain the timer channel here; receiving from
			// it is exactly what told us that the timer has expired.
			isTimerStopped = true
			tr.tick()
		case <-tr.stopChan:
			// even though this doesn't really matter in this context, it's
			// good hygiene to drain the value.
			timer.Stop()
			select {
			case <-timer.C:
			default:
			}
			return
		}
	}
}

// taskInTerminalState returns true if task is in a terminal state.
func taskInTerminalState(task *api.Task) bool {
	return task.Status.State > api.TaskStateRunning
}

// taskWillNeverRun returns true if task will never reach running state.
func taskWillNeverRun(task *api.Task) bool {
	return task.Status.State < api.TaskStateAssigned && task.DesiredState > api.TaskStateRunning
}
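
// Both helpers above rely on api.TaskState being an ordered progression of
// states (roughly NEW < PENDING < ASSIGNED < ... < RUNNING < COMPLETE <
// SHUTDOWN < FAILED < ... < ORPHANED). For example, a task whose observed
// state is still PENDING but whose desired state is SHUTDOWN satisfies
// taskWillNeverRun, while a task observed in FAILED satisfies
// taskInTerminalState.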

// tick performs task history cleanup.
func (tr *TaskReaper) tick() {
	// this signals that a tick has occurred. it exists solely for testing.
	if tr.tickSignal != nil {
		// try writing to this channel, but if it's full, fall straight through
		// and ignore it.
		select {
		case tr.tickSignal <- struct{}{}:
		default:
		}
	}

	if len(tr.dirty) == 0 && len(tr.cleanup) == 0 {
		return
	}

	defer func() {
		tr.cleanup = nil
	}()

	deleteTasks := make(map[string]struct{})
	for _, tID := range tr.cleanup {
		deleteTasks[tID] = struct{}{}
	}

	// Check history of dirty tasks for cleanup.
	// Note: Clean out the dirty set at the end of this tick iteration
	// in all but one scenario (documented below).
	// When tick() finishes, the tasks in a dirty slot have either been cleaned
	// up, or the slot was skipped because it didn't meet the criteria for
	// cleaning. Either way, we can discard the dirty set because future events
	// on that slot will cause it to be re-added to the dirty set at that point.
	//
	// The only case in which we keep the slot dirty is when more than one
	// running task is present for a given slot.
	// In that case, we need to keep the slot dirty to allow it to be
	// cleaned when tick() is called next and one or more of the tasks
	// in that slot have stopped running.
	tr.store.View(func(tx store.ReadTx) {
		for dirty := range tr.dirty {
			service := store.GetService(tx, dirty.ServiceID)
			if service == nil {
				delete(tr.dirty, dirty)
				continue
			}

			taskHistory := tr.taskHistory

			// If MaxAttempts is set, keep at least one more than
			// that number of tasks (this overrides TaskHistoryRetentionLimit).
			// This is necessary to reconstruct restart history when the orchestrator starts up.
			// TODO(aaronl): Consider hiding tasks beyond the normal
			// retention limit in the UI.
			// TODO(aaronl): There are some ways to cut down the
			// number of retained tasks at the cost of more
			// complexity:
			//   - Don't force retention of tasks with an older spec
			//     version.
			//   - Don't force retention of tasks outside of the
			//     time window configured for restart lookback.
			if service.Spec.Task.Restart != nil && service.Spec.Task.Restart.MaxAttempts > 0 {
				taskHistory = int64(service.Spec.Task.Restart.MaxAttempts) + 1
			}
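
			// For example (illustrative numbers, not from the source): with
			// TaskHistoryRetentionLimit set to 5 and Restart.MaxAttempts set
			// to 10, taskHistory becomes 11 here, and the pruning below keeps
			// at least 11 tasks for this slot.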

			// A negative TaskHistoryRetentionLimit means task history should
			// never be cleaned up.
			if taskHistory < 0 {
				delete(tr.dirty, dirty)
				continue
			}

			var historicTasks []*api.Task

			switch service.Spec.GetMode().(type) {
			case *api.ServiceSpec_Replicated:
				// Clean out the slot for which we received EventCreateTask.
				var err error
				historicTasks, err = store.FindTasks(tx, store.BySlot(dirty.ServiceID, dirty.Slot))
				if err != nil {
					continue
				}

			case *api.ServiceSpec_Global:
				// Clean out the node history for global services.
				tasksByNode, err := store.FindTasks(tx, store.ByNodeID(dirty.NodeID))
				if err != nil {
					continue
				}

				for _, t := range tasksByNode {
					if t.ServiceID == dirty.ServiceID {
						historicTasks = append(historicTasks, t)
					}
				}
			}

			if int64(len(historicTasks)) <= taskHistory {
				delete(tr.dirty, dirty)
				continue
			}

			// TODO(aaronl): This could filter for non-running tasks and use quickselect
			// instead of sorting the whole slice.
			// TODO(aaronl): This sort should really use lamport time instead of wall
			// clock time. We should store a Version in the Status field.
			sort.Sort(orchestrator.TasksByTimestamp(historicTasks))

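			// The loop below walks historicTasks in timestamp order and marks
			// terminal (or never-to-run) tasks for deletion, bumping taskHistory
			// for each one and stopping once len(historicTasks) <= taskHistory.
			// As a result, at most len(historicTasks)-taskHistory tasks are
			// deleted here, and at least taskHistory of them are retained.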
			runningTasks := 0
			for _, t := range historicTasks {
				// Historical tasks can be considered for cleanup if:
				// 1. The task has reached a terminal state i.e. actual state beyond TaskStateRunning.
				// 2. The task has not yet become running and desired state is a terminal state i.e.
				// actual state not yet TaskStateAssigned and desired state beyond TaskStateRunning.
				if taskInTerminalState(t) || taskWillNeverRun(t) {
					deleteTasks[t.ID] = struct{}{}

					taskHistory++
					if int64(len(historicTasks)) <= taskHistory {
						break
					}
				} else {
					// all other tasks are counted as running.
					runningTasks++
				}
			}

			// The only case in which we keep the slot dirty at the end of tick()
			// is when more than one running task is present for a given slot.
			// In that case, we keep the slot dirty to allow it to be
			// cleaned when tick() is called next and one or more of
			// the tasks in that slot have stopped running.
			if runningTasks <= 1 {
				delete(tr.dirty, dirty)
			}
		}
	})

	// Perform cleanup.
	if len(deleteTasks) > 0 {
		tr.store.Batch(func(batch *store.Batch) error {
			for taskID := range deleteTasks {
				batch.Update(func(tx store.Tx) error {
					return store.DeleteTask(tx, taskID)
				})
			}
			return nil
		})
	}
}

// Stop stops the TaskReaper and waits for the main loop to exit.
// Stop can be called in two cases: when the manager is shutting down, or when
// the manager (the leader) is stepping down to become a follower. Since these
// two calls could race with each other, we use closeOnce here to ensure that
// TaskReaper.Stop() is called only once to avoid a panic.
func (tr *TaskReaper) Stop() {
	tr.closeOnce.Do(func() {
		close(tr.stopChan)
	})
	<-tr.doneChan
}
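
// runUntilCancelled is an illustrative sketch (not part of the upstream
// swarmkit source) of how a caller could tie the reaper's lifetime to a
// context: Run is started in a goroutine, and Stop is invoked once the
// context is cancelled, which also waits for the watch loop to exit.
func runUntilCancelled(ctx context.Context, s *store.MemoryStore) {
	tr := New(s)
	go tr.Run(ctx)
	<-ctx.Done()
	tr.Stop()
}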