github.com/kaisenlinux/docker.io@v0.0.0-20230510090727-ea55db55fac7/swarmkit/manager/orchestrator/jobs/replicated/reconciler.go

package replicated

import (
	"context"
	"fmt"

	"github.com/docker/swarmkit/api"
	"github.com/docker/swarmkit/manager/orchestrator"
	"github.com/docker/swarmkit/manager/state/store"
)

// restartSupervisor is an interface representing the methods from the
// restart.SupervisorInterface that are actually needed by the reconciler. This
// more limited interface allows us to write a less ugly fake for unit testing.
type restartSupervisor interface {
	Restart(context.Context, store.Tx, *api.Cluster, *api.Service, api.Task) error
}
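
// A test can satisfy restartSupervisor with a small fake. A minimal sketch
// (fakeSupervisor is hypothetical and not part of this package):
//
//	type fakeSupervisor struct {
//		restarted []string
//	}
//
//	func (f *fakeSupervisor) Restart(_ context.Context, _ store.Tx, _ *api.Cluster, _ *api.Service, t api.Task) error {
//		f.restarted = append(f.restarted, t.ID)
//		return nil
//	}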

// Reconciler is an object that manages reconciliation of replicated jobs. It
// is blocking and non-asynchronous, for ease of testing. It implements two
// interfaces. The first is the Reconciler interface of the Orchestrator
// package above this one. The second is the taskinit.InitHandler interface.
type Reconciler struct {
	// we need the store, of course, to do updates
	store *store.MemoryStore

	restart restartSupervisor
}
// NewReconciler creates a new Reconciler object.
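//
// A minimal usage sketch (assuming a memory store "s" and a value
// "supervisor" that satisfies restartSupervisor; both names are
// illustrative):
//
//	r := NewReconciler(s, supervisor)
//	if err := r.ReconcileService(serviceID); err != nil {
//		// handle the reconciliation error
//	}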
func NewReconciler(store *store.MemoryStore, restart restartSupervisor) *Reconciler {
	return &Reconciler{
		store:   store,
		restart: restart,
	}
}

// ReconcileService reconciles the replicated job service with the given ID by
// checking to see if new replicas should be created. ReconcileService returns
// an error if something prevents it from correctly reconciling the service.
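//
// At a high level, ReconcileService gathers the service, its tasks, and the
// cluster object, counts the running and completed tasks belonging to the
// current job iteration, creates new tasks up to the limits imposed by
// MaxConcurrent and TotalCompletions, hands failed tasks to the restart
// supervisor, and marks tasks from older iterations for removal.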
func (r *Reconciler) ReconcileService(id string) error {
	var (
		service *api.Service
		tasks   []*api.Task
		cluster *api.Cluster
		viewErr error
	)
	// first, get the service and all of its tasks
	r.store.View(func(tx store.ReadTx) {
		service = store.GetService(tx, id)

		tasks, viewErr = store.FindTasks(tx, store.ByServiceID(id))

		// there should only ever be 1 cluster object, but for reasons
		// forgotten by me, it needs to be retrieved in a rather roundabout way
		// from the store
		var clusters []*api.Cluster
		clusters, viewErr = store.FindClusters(tx, store.All)
		if len(clusters) == 1 {
			cluster = clusters[0]
		} else if len(clusters) > 1 {
			// this should never happen, and indicates that the system is
			// broken.
			panic("there should never be more than one cluster object")
		}
	})

	// errors during view should only happen in a few rather catastrophic
	// cases, but here it's not unreasonable to just return an error anyway.
	if viewErr != nil {
		return viewErr
	}

	// if the service has already been deleted, there's nothing to do here.
	if service == nil {
		return nil
	}

	// if this is the first iteration of the service, it may not yet have a
	// JobStatus; if it doesn't, create one. this won't actually be committed,
	// though.
	if service.JobStatus == nil {
		service.JobStatus = &api.JobStatus{}
	}

	// Jobs can be run in multiple iterations. The JobStatus of the service
	// indicates which iteration (Version) we're on. We should only be looking
	// at tasks from the latest iteration.

	jobVersion := service.JobStatus.JobIteration.Index

	// now, check how many tasks we need and how many we have running. note
	// that some of these Running tasks may complete before we even finish this
	// code block, and so we might have to immediately re-enter reconciliation,
	// so this number is not 100% definitive, but it is accurate for this
	// particular moment in time, and it won't result in us going OVER the
	// needed task count
	//
	// importantly, we are computing only how many _new_ tasks are needed. Some
	// tasks may need to be restarted as well, but we don't do this directly;
	// restarting tasks is under the purview of the restartSupervisor.
	//
	// also, for the math later, we need these values to be of type uint64.
	runningTasks := uint64(0)
	completeTasks := uint64(0)
	restartTasks := []string{}
	removeTasks := []string{}

	// for replicated jobs, each task will get a different slot number, so that
	// when the job has completed, there will be one Completed task in every
	// slot number [0, TotalCompletions-1].
	//
	// By assigning each task to a unique slot, we simplify handling of
	// restarting failed tasks through the restart manager.
	slots := map[uint64]bool{}
	for _, task := range tasks {
		// we only care about tasks from this job iteration. tasks from the
		// previous job iteration are not important
		if task.JobIteration != nil {
			if task.JobIteration.Index == jobVersion {
				if task.Status.State == api.TaskStateCompleted {
					completeTasks++
					slots[task.Slot] = true
				}

				// the Restart Manager may put a task in the desired state Ready,
				// so we should match not only tasks in desired state Completed,
				// but also those in any valid running state.
				if task.Status.State != api.TaskStateCompleted && task.DesiredState <= api.TaskStateCompleted {
					runningTasks++
					slots[task.Slot] = true

					// if the task is in a terminal state, we might need to restart
					// it. throw it on the pile if so. this is still counted as a
					// running task for the purpose of determining how many new
					// tasks to create.
					if task.Status.State > api.TaskStateCompleted {
						restartTasks = append(restartTasks, task.ID)
					}
				}
			} else {
				// tasks belonging to a previous iteration of the job may
				// exist. if any such tasks exist, they should have their
				// desired state set to Remove
				if task.Status.State <= api.TaskStateRunning && task.DesiredState != api.TaskStateRemove {
					removeTasks = append(removeTasks, task.ID)
				}
			}
		}
	}

	// now that we have our counts, we need to see how many new tasks to
	// create. this number can never exceed MaxConcurrent, but also should not
	// result in us exceeding TotalCompletions. first, get these numbers out of
	// the service spec.
	rj := service.Spec.GetReplicatedJob()

	// possibleNewTasks gives us the upper bound for how many tasks we'll
	// create. also, ugh, subtracting uints. there's no way this can ever go
	// wrong.
	possibleNewTasks := rj.MaxConcurrent - runningTasks

	// allowedNewTasks is how many tasks we could create, if there were no
	// restriction on maximum concurrency. This is the total number of tasks
	// we want completed, minus the tasks that are already completed, minus
	// the tasks that are in progress.
	//
	// seriously, ugh, subtracting unsigned ints. totally a fine and not at all
	// risky operation, with no possibility for catastrophe
	allowedNewTasks := rj.TotalCompletions - completeTasks - runningTasks

	// the lower of allowedNewTasks and possibleNewTasks is how many we can
	// create. we'll just use an if statement instead of some fancy floor
	// function.
	actualNewTasks := allowedNewTasks
	if possibleNewTasks < allowedNewTasks {
		actualNewTasks = possibleNewTasks
	}
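
	// as an illustrative example (numbers not taken from any real spec): with
	// MaxConcurrent = 3, TotalCompletions = 10, 2 tasks running, and 4 tasks
	// complete, possibleNewTasks is 3-2 = 1 and allowedNewTasks is 10-4-2 = 4,
	// so actualNewTasks is 1.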

	// this check might seem odd, but it protects us from an underflow of the
	// above subtractions, which, again, is a totally impossible thing that can
	// never happen, ever, obviously.
	if actualNewTasks > rj.TotalCompletions {
		return fmt.Errorf(
			"uint64 underflow, we're not going to create %v tasks",
			actualNewTasks,
		)
	}

	// finally, we can create these tasks. do this in a batch operation, to
	// avoid exceeding transaction size limits
	err := r.store.Batch(func(batch *store.Batch) error {
		for i := uint64(0); i < actualNewTasks; i++ {
			if err := batch.Update(func(tx store.Tx) error {
				var slot uint64
				// each task will go into a unique slot, and at the end, there
				// should be the same number of slots as there are desired
				// total completions. We could simplify this logic by assuming
				// that slots are filled in order, but it's more robust not to
				// assume that, and instead ensure that the slot is unoccupied.
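				//
				// as an illustrative example: with TotalCompletions = 5 and
				// slots 0 and 2 already occupied, the first new task gets
				// slot 1, the next gets slot 3, and so on.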
				for s := uint64(0); s < rj.TotalCompletions; s++ {
					// when we're iterating through, if the service has slots
					// that haven't been used yet (for example, if this is the
					// first time we're running this iteration), then doing a
					// map lookup for the number will return the zero value
					// (false) when the number isn't present in the map.
					if !slots[s] {
						slot = s
						// once we've found a slot, mark it as occupied, so we
						// don't double assign in subsequent iterations.
						slots[slot] = true
						break
					}
				}

				task := orchestrator.NewTask(cluster, service, slot, "")
				// when we create the task, we also need to set the
				// JobIteration.
				task.JobIteration = &api.Version{Index: jobVersion}
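				// job tasks are expected to run to completion rather than to
				// stay up indefinitely, so their desired state is Completed
				// instead of Running.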
				task.DesiredState = api.TaskStateCompleted

				// finally, create the task in the store.
				return store.CreateTask(tx, task)
			}); err != nil {
				return err
			}
		}

		for _, taskID := range restartTasks {
			if err := batch.Update(func(tx store.Tx) error {
				t := store.GetTask(tx, taskID)
				if t == nil {
					return nil
				}

				if t.DesiredState > api.TaskStateCompleted {
					return nil
				}

				// TODO(dperny): pass in context from above
				return r.restart.Restart(context.Background(), tx, cluster, service, *t)
			}); err != nil {
				return err
			}
		}

		for _, taskID := range removeTasks {
			if err := batch.Update(func(tx store.Tx) error {
				t := store.GetTask(tx, taskID)
				if t == nil {
					return nil
				}

				// don't do unnecessary updates
				if t.DesiredState == api.TaskStateRemove {
					return nil
				}
				t.DesiredState = api.TaskStateRemove
				return store.UpdateTask(tx, t)
			}); err != nil {
				return err
			}
		}

		return nil
	})

	return err
}

// IsRelatedService returns true if the service is a replicated job. This
// method fulfills the taskinit.InitHandler interface. Because it is just a
// wrapper around a well-tested function call, it has no tests of its own.
func (r *Reconciler) IsRelatedService(service *api.Service) bool {
	return orchestrator.IsReplicatedJob(service)
}

// FixTask ostensibly validates that a task is compliant with the rest of the
// cluster state. However, in the replicated jobs case, the only action we
// can take with a noncompliant task is to restart it. Because the replicated
// jobs orchestrator reconciles the whole service at once, any tasks that
// need to be restarted will be handled when we make the reconciliation pass
// over all services. Therefore, in this instance, FixTask does nothing except
// implement the FixTask method of the taskinit.InitHandler interface.
func (r *Reconciler) FixTask(_ context.Context, _ *store.Batch, _ *api.Task) {}

// SlotTuple returns an orchestrator.SlotTuple object for this task. It
// implements the taskinit.InitHandler interface.
func (r *Reconciler) SlotTuple(t *api.Task) orchestrator.SlotTuple {
	return orchestrator.SlotTuple{
		ServiceID: t.ServiceID,
		Slot:      t.Slot,
	}
}