github.com/kaisenlinux/docker.io@v0.0.0-20230510090727-ea55db55fac7/swarmkit/manager/orchestrator/jobs/orchestrator.go

package jobs

import (
	"context"
	"sync"

	"github.com/docker/go-events"

	"github.com/docker/swarmkit/api"
	"github.com/docker/swarmkit/log"
	"github.com/docker/swarmkit/manager/orchestrator"
	"github.com/docker/swarmkit/manager/orchestrator/jobs/global"
	"github.com/docker/swarmkit/manager/orchestrator/jobs/replicated"
	"github.com/docker/swarmkit/manager/orchestrator/restart"
	"github.com/docker/swarmkit/manager/orchestrator/taskinit"
	"github.com/docker/swarmkit/manager/state/store"
)

// Reconciler is the type that holds the reconciliation logic for the
// orchestrator. It exists so that the logic of actually reconciling and
// writing to the store is separated from the orchestrator, to make the event
// handling logic in the orchestrator easier to test.
type Reconciler interface {
	taskinit.InitHandler

	ReconcileService(id string) error
}

// Orchestrator is the combined orchestrator controlling both Global and
// Replicated Jobs. Initially, these job types were two separate
// orchestrators, like the Replicated and Global orchestrators. However, it
// became apparent that because of the simplicity of Jobs as compared to
// Services, one combined orchestrator suffices for both job types.
type Orchestrator struct {
	store *store.MemoryStore

	// two reconcilers, one for each service type

	replicatedReconciler Reconciler
	globalReconciler     Reconciler

	// startOnce ensures that the orchestrator cannot be started more than
	// once.
	startOnce sync.Once

	// restartSupervisor is the component that handles restarting tasks
	restartSupervisor restart.SupervisorInterface

	// stopChan is a channel that is closed to signal the orchestrator to
	// stop running
	stopChan chan struct{}
	// stopOnce is used to ensure that stopChan can only be closed once, just
	// in case some freak accident causes subsequent calls to Stop.
	stopOnce sync.Once
	// doneChan is closed when the orchestrator actually stops running
	doneChan chan struct{}

	// checkTasksFunc is a field that holds taskinit.CheckTasks, but allows
	// swapping it out in testing.
	checkTasksFunc func(context.Context, *store.MemoryStore, store.ReadTx, taskinit.InitHandler, restart.SupervisorInterface) error

	// watchChan and watchCancel provide the event stream
	watchChan   chan events.Event
	watchCancel func()
}

// NewOrchestrator creates a new jobs Orchestrator operating on the given
// store.
func NewOrchestrator(store *store.MemoryStore) *Orchestrator {
	return &Orchestrator{
		store:    store,
		stopChan: make(chan struct{}),
		doneChan: make(chan struct{}),
	}
}

// Run runs the Orchestrator reconciliation loop. It takes a context as an
// argument, but canceling this context will not stop the routine; the
// context is only for passing in logging information. Call Stop to stop the
// Orchestrator.
func (o *Orchestrator) Run(ctx context.Context) {
	o.startOnce.Do(func() { o.run(ctx) })
}
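
// Usage sketch (illustrative, not part of the upstream file): Run blocks
// until Stop is called, so a caller would typically start the orchestrator
// on its own goroutine. The variable names here are hypothetical.
//
//	o := NewOrchestrator(memoryStore)
//	go o.Run(ctx)
//	// ... later, during shutdown ...
//	o.Stop() // blocks until the run loop has fully exited
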
// init runs the once-off initialization logic for the orchestrator. This
// includes initializing the sub-components, starting the channel watch, and
// running the initial reconciliation pass. It runs as part of the run
// method, but is broken out for the purpose of testing.
func (o *Orchestrator) init(ctx context.Context) {
	var services []*api.Service

	// there are several components to the Orchestrator that are interfaces
	// designed to be swapped out in testing. in production, these fields
	// will all be unset, and be initialized here. in testing, we will set
	// fakes, and this initialization will be skipped.

	if o.restartSupervisor == nil {
		o.restartSupervisor = restart.NewSupervisor(o.store)
	}

	if o.replicatedReconciler == nil {
		// the cluster might be nil, but that doesn't matter.
		o.replicatedReconciler = replicated.NewReconciler(o.store, o.restartSupervisor)
	}

	if o.globalReconciler == nil {
		o.globalReconciler = global.NewReconciler(o.store, o.restartSupervisor)
	}

	if o.checkTasksFunc == nil {
		o.checkTasksFunc = taskinit.CheckTasks
	}

	o.watchChan, o.watchCancel, _ = store.ViewAndWatch(o.store, func(tx store.ReadTx) error {
		services, _ = store.FindServices(tx, store.All)
		return nil
	})

	// checkTasksFunc is used to resume any in-progress restarts that were
	// interrupted by a leadership change. In other orchestrators, this
	// additionally queues up some tasks to be restarted. However, the jobs
	// orchestrator will make a reconciliation pass across all services
	// immediately after this, and so does not need to restart any tasks;
	// they will be restarted during this pass.
	//
	// we cannot call o.checkTasksFunc inside of store.ViewAndWatch above.
	// despite taking a callback with a ReadTx, it actually performs an
	// Update, which acquires a lock and will result in a deadlock. instead,
	// do o.checkTasksFunc here.
	o.store.View(func(tx store.ReadTx) {
		o.checkTasksFunc(ctx, o.store, tx, o.replicatedReconciler, o.restartSupervisor)
		o.checkTasksFunc(ctx, o.store, tx, o.globalReconciler, o.restartSupervisor)
	})

	for _, service := range services {
		if orchestrator.IsReplicatedJob(service) {
			if err := o.replicatedReconciler.ReconcileService(service.ID); err != nil {
				log.G(ctx).WithField(
					"service.id", service.ID,
				).WithError(err).Error("error reconciling replicated job")
			}
		}

		if orchestrator.IsGlobalJob(service) {
			if err := o.globalReconciler.ReconcileService(service.ID); err != nil {
				log.G(ctx).WithField(
					"service.id", service.ID,
				).WithError(err).Error("error reconciling global job")
			}
		}
	}
}
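
// Testing sketch (illustrative, assuming a hypothetical fakeReconciler type
// that implements Reconciler): because init only fills in fields that are
// still nil, a test can inject fakes before calling Run, and the production
// components will never be constructed.
//
//	o := NewOrchestrator(s)
//	o.replicatedReconciler = &fakeReconciler{}
//	o.globalReconciler = &fakeReconciler{}
//	o.checkTasksFunc = func(context.Context, *store.MemoryStore, store.ReadTx, taskinit.InitHandler, restart.SupervisorInterface) error {
//		return nil
//	}
//	go o.Run(ctx)
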
// run provides the actual meat of the run operation. The call to run is
// made inside of Run, and is enclosed in a sync.Once to stop this from
// being called multiple times.
func (o *Orchestrator) run(ctx context.Context) {
	ctx = log.WithModule(ctx, "orchestrator/jobs")

	// closing doneChan should be the absolute last thing that happens in
	// this method, and so should be the absolute first thing we defer.
	defer close(o.doneChan)

	o.init(ctx)
	defer o.watchCancel()

	for {
		// first, before taking any action, see if we should stop the
		// orchestrator. if both the stop channel and the watch channel are
		// available to read, the channel that gets read is picked at random,
		// but we always want to stop if it's possible.
		select {
		case <-o.stopChan:
			return
		default:
		}

		select {
		case event := <-o.watchChan:
			o.handleEvent(ctx, event)
		case <-o.stopChan:
			// we also need to check for stop in here, in case there are no
			// updates to cause the loop to turn over.
			return
		}
	}
}

// handleEvent does the logic of handling one event message and calling the
// reconciler as needed. by handling the event logic in this function, we
// can make an end-run around the run loop and avoid being at the mercy of
// the go scheduler when testing the orchestrator.
func (o *Orchestrator) handleEvent(ctx context.Context, event events.Event) {
	var (
		service *api.Service
		task    *api.Task
	)

	switch ev := event.(type) {
	case api.EventCreateService:
		service = ev.Service
	case api.EventUpdateService:
		service = ev.Service
	case api.EventUpdateTask:
		task = ev.Task
	}

	// if this is a task event, we should check if it means the service
	// should be reconciled.
	if task != nil {
		// only bother with all this if the task has entered a terminal
		// state and we didn't want that to happen.
		if task.Status.State > api.TaskStateRunning && task.DesiredState <= api.TaskStateCompleted {
			o.store.View(func(tx store.ReadTx) {
				// if for any reason the service ID is invalid, then service
				// will just be nil and nothing needs to be done
				service = store.GetService(tx, task.ServiceID)
			})
		}
	}

	if orchestrator.IsReplicatedJob(service) {
		if err := o.replicatedReconciler.ReconcileService(service.ID); err != nil {
			log.G(ctx).WithField(
				"service.id", service.ID,
			).WithError(err).Error("error reconciling replicated job")
		}
	}

	if orchestrator.IsGlobalJob(service) {
		if err := o.globalReconciler.ReconcileService(service.ID); err != nil {
			log.G(ctx).WithField(
				"service.id", service.ID,
			).WithError(err).Error("error reconciling global job")
		}
	}
}

// Stop stops the Orchestrator.
func (o *Orchestrator) Stop() {
	// close stopChan inside of the Once so that there can be no races
	// involving multiple attempts to close stopChan.
	o.stopOnce.Do(func() {
		close(o.stopChan)
	})
	// now, we wait for the Orchestrator to stop. this wait is unqualified;
	// we will not return until the Orchestrator has stopped successfully.
	<-o.doneChan
}
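
// Shutdown-handshake sketch (illustrative, hypothetical names): Stop and
// run cooperate through two channels. Closing stopChan (guarded by
// stopOnce) asks the run loop to exit, and the deferred close of doneChan
// in run then releases the receive in Stop. Condensed:
//
//	stop := make(chan struct{})
//	done := make(chan struct{})
//
//	go func() { // the run side
//		defer close(done)
//		<-stop // stands in for the run loop's selects on stopChan
//	}()
//
//	close(stop) // the Stop side: signal the loop to exit...
//	<-done      // ...then wait until it actually has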