github.com/kaisenlinux/docker.io@v0.0.0-20230510090727-ea55db55fac7/swarmkit/manager/orchestrator/jobs/orchestrator.go

package jobs

import (
	"context"
	"sync"

	"github.com/docker/go-events"

	"github.com/docker/swarmkit/api"
	"github.com/docker/swarmkit/log"
	"github.com/docker/swarmkit/manager/orchestrator"
	"github.com/docker/swarmkit/manager/orchestrator/jobs/global"
	"github.com/docker/swarmkit/manager/orchestrator/jobs/replicated"
	"github.com/docker/swarmkit/manager/orchestrator/restart"
	"github.com/docker/swarmkit/manager/orchestrator/taskinit"
	"github.com/docker/swarmkit/manager/state/store"
)

// Reconciler is the type that holds the reconciliation logic for the
// orchestrator. It exists so that the logic of actually reconciling and
// writing to the store is separated from the orchestrator, to make the event
// handling logic in the orchestrator easier to test.
type Reconciler interface {
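	// taskinit.InitHandler is embedded so that a Reconciler can also be
	// handed to taskinit.CheckTasks when in-progress restarts are resumed on
	// startup (see checkTasksFunc in init below).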
	taskinit.InitHandler

	ReconcileService(id string) error
}

// Orchestrator is the combined orchestrator controlling both Global and
// Replicated Jobs. Initially, these job types were two separate orchestrators,
// like the Replicated and Global orchestrators. However, it became apparent
// that because of the simplicity of Jobs as compared to Services, one combined
// orchestrator suffices for both job types.
type Orchestrator struct {
	store *store.MemoryStore

	// two reconcilers, one for each service type

	replicatedReconciler Reconciler
	globalReconciler     Reconciler

	// startOnce ensures that the orchestrator can only be started once.
	startOnce sync.Once

	// restartSupervisor is the component that handles restarting tasks
	restartSupervisor restart.SupervisorInterface

	// stopChan is a channel that is closed to signal the orchestrator to stop
	// running
	stopChan chan struct{}
	// stopOnce is used to ensure that stopChan can only be closed once, even
	// if Stop is called multiple times.
	stopOnce sync.Once
	// doneChan is closed when the orchestrator actually stops running
	doneChan chan struct{}

	// checkTasksFunc holds taskinit.CheckTasks, but allows swapping it out in
	// testing.
	checkTasksFunc func(context.Context, *store.MemoryStore, store.ReadTx, taskinit.InitHandler, restart.SupervisorInterface) error

	// the watchChan and watchCancel provide the event stream
	watchChan   chan events.Event
	watchCancel func()
}

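// NewOrchestrator creates a new jobs Orchestrator backed by the given store.
// The returned Orchestrator does nothing until Run is called.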
func NewOrchestrator(store *store.MemoryStore) *Orchestrator {
	return &Orchestrator{
		store:    store,
		stopChan: make(chan struct{}),
		doneChan: make(chan struct{}),
	}
}

// Run runs the Orchestrator reconciliation loop. It takes a context as an
// argument, but canceling this context will not stop the routine; this context
// is only for passing in logging information. Call Stop to stop the
// Orchestrator.
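//
// A minimal usage sketch (the store and context here are assumed to be
// provided by the surrounding manager code):
//
//	o := NewOrchestrator(s)
//	go o.Run(ctx)
//	// ... later, during shutdown:
//	o.Stop()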
func (o *Orchestrator) Run(ctx context.Context) {
	o.startOnce.Do(func() { o.run(ctx) })
}

// init runs the one-time initialization logic for the orchestrator. This
// includes initializing the sub-components, starting the channel watch, and
// running the initial reconciliation pass. This runs as part of the run
// method, but is broken out for the purpose of testing.
func (o *Orchestrator) init(ctx context.Context) {
	var services []*api.Service

	// there are several components of the Orchestrator that are interfaces
	// designed to be swapped out in testing. in production, these fields are
	// all unset and are initialized here. in testing, fakes are set instead,
	// and this initialization is skipped.

	if o.restartSupervisor == nil {
		o.restartSupervisor = restart.NewSupervisor(o.store)
	}

	if o.replicatedReconciler == nil {
		// the cluster might be nil, but that doesn't matter.
		o.replicatedReconciler = replicated.NewReconciler(o.store, o.restartSupervisor)
	}

	if o.globalReconciler == nil {
		o.globalReconciler = global.NewReconciler(o.store, o.restartSupervisor)
	}

	if o.checkTasksFunc == nil {
		o.checkTasksFunc = taskinit.CheckTasks
	}

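	// the error returns from ViewAndWatch and FindServices are discarded
	// here: the view callback only ever returns nil, and if FindServices
	// fails, services stays empty and the initial reconciliation pass below
	// simply has nothing to do.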
	o.watchChan, o.watchCancel, _ = store.ViewAndWatch(o.store, func(tx store.ReadTx) error {
		services, _ = store.FindServices(tx, store.All)
		return nil
	})

	// checkTasksFunc is used to resume any in-progress restarts that were
	// interrupted by a leadership change. In other orchestrators, this
	// additionally queues up some tasks to be restarted. However, the jobs
	// orchestrator will make a reconciliation pass across all services
	// immediately after this, and so does not need to restart any tasks; they
	// will be restarted during this pass.
	//
	// we cannot call o.checkTasksFunc inside of store.ViewAndWatch above.
	// despite taking a callback with a ReadTx, it actually performs an Update,
	// which acquires a lock and will result in a deadlock. instead, do
	// o.checkTasksFunc here.
	o.store.View(func(tx store.ReadTx) {
		o.checkTasksFunc(ctx, o.store, tx, o.replicatedReconciler, o.restartSupervisor)
		o.checkTasksFunc(ctx, o.store, tx, o.globalReconciler, o.restartSupervisor)
	})

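	// now make the initial reconciliation pass over every service that
	// existed when the watch was started. only job-mode services are
	// reconciled; everything else is skipped.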
	for _, service := range services {
		if orchestrator.IsReplicatedJob(service) {
			if err := o.replicatedReconciler.ReconcileService(service.ID); err != nil {
				log.G(ctx).WithField(
					"service.id", service.ID,
				).WithError(err).Error("error reconciling replicated job")
			}
		}

		if orchestrator.IsGlobalJob(service) {
			if err := o.globalReconciler.ReconcileService(service.ID); err != nil {
				log.G(ctx).WithField(
					"service.id", service.ID,
				).WithError(err).Error("error reconciling global job")
			}
		}
	}
}

// run provides the actual meat of the run operation. The call to run is made
// inside of Run, and is enclosed in a sync.Once to stop this from being
// called multiple times.
func (o *Orchestrator) run(ctx context.Context) {
	ctx = log.WithModule(ctx, "orchestrator/jobs")

	// closing doneChan should be the absolute last thing that happens in this
	// method, and so should be the absolute first thing we defer.
	defer close(o.doneChan)

	o.init(ctx)
	defer o.watchCancel()

	for {
		// first, before taking any action, see if we should stop the
		// orchestrator. if both the stop channel and the watch channel are
		// available to read, the channel that gets read is picked at random,
		// but we always want to stop if it's possible.
		select {
		case <-o.stopChan:
			return
		default:
		}

		select {
		case event := <-o.watchChan:
			o.handleEvent(ctx, event)
		case <-o.stopChan:
			// we also need to check for stop in here, in case there are no
			// updates to cause the loop to turn over.
			return
		}
	}
}

// handleEvent does the logic of handling one event message and calling the
// reconciler as needed. by handling the event logic in this function, we can
// make an end-run around the run loop and avoid being at the mercy of the go
// scheduler when testing the orchestrator.
func (o *Orchestrator) handleEvent(ctx context.Context, event events.Event) {
	var (
		service *api.Service
		task    *api.Task
	)

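	// only service creates and updates and task updates are of interest. any
	// other event type leaves both service and task nil, and nothing is
	// reconciled below.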
	switch ev := event.(type) {
	case api.EventCreateService:
		service = ev.Service
	case api.EventUpdateService:
		service = ev.Service
	case api.EventUpdateTask:
		task = ev.Task
	}

	// if this is a task event, we should check if it means the service
	// should be reconciled.
	if task != nil {
		// only bother with all this if the task has entered a terminal state
		// (its actual state is past running) while its desired state is at
		// most completed, i.e. the task was not deliberately shut down.
		if task.Status.State > api.TaskStateRunning && task.DesiredState <= api.TaskStateCompleted {
			o.store.View(func(tx store.ReadTx) {
				// if for any reason the service ID is invalid, then
				// service will just be nil and nothing needs to be
				// done
				service = store.GetService(tx, task.ServiceID)
			})
		}
	}

	if orchestrator.IsReplicatedJob(service) {
		if err := o.replicatedReconciler.ReconcileService(service.ID); err != nil {
			log.G(ctx).WithField(
				"service.id", service.ID,
			).WithError(err).Error("error reconciling replicated job")
		}
	}

	if orchestrator.IsGlobalJob(service) {
		if err := o.globalReconciler.ReconcileService(service.ID); err != nil {
			log.G(ctx).WithField(
				"service.id", service.ID,
			).WithError(err).Error("error reconciling global job")
		}
	}
}

// Stop stops the Orchestrator and blocks until it has fully stopped.
func (o *Orchestrator) Stop() {
	// close stopChan inside of the Once so that there can be no races
	// involving multiple attempts to close stopChan.
	o.stopOnce.Do(func() {
		close(o.stopChan)
	})
	// now, we wait for the Orchestrator to stop. this wait is unqualified; we
	// will not return until the Orchestrator has stopped successfully.
	<-o.doneChan
}