github.com/kaisenlinux/docker.io@v0.0.0-20230510090727-ea55db55fac7/swarmkit/manager/deallocator/deallocator.go (about) 1 package deallocator 2 3 import ( 4 "context" 5 6 "github.com/docker/go-events" 7 "github.com/docker/swarmkit/api" 8 "github.com/docker/swarmkit/log" 9 "github.com/docker/swarmkit/manager/state/store" 10 ) 11 12 // Deallocator waits for services to fully shutdown (ie no containers left) 13 // and then proceeds to deallocate service-level resources (e.g. networks), 14 // and finally services themselves 15 // in particular, the Deallocator should be the only place where services, or 16 // service-level resources, are ever deleted! 17 // 18 // It’s worth noting that this new component’s role is quite different from 19 // the task reaper’s: tasks are purely internal to Swarmkit, and their status 20 // is entirely managed by the system itself. In contrast, the deallocator is 21 // responsible for safely deleting entities that are directly controlled by the 22 // user. 23 // 24 // NOTE: since networks are the only service-level resources as of now, 25 // it has been deemed over-engineered to have a generic way to 26 // handle other types of service-level resources; if we ever start 27 // having more of those and thus want to reconsider this choice, it 28 // might be worth having a look at this archived branch, that does 29 // implement a way of separating the code for the deallocator itself 30 // from each resource-speficic way of handling it 31 // https://github.com/docker/swarmkit/compare/a84c01f49091167dd086c26b45dc18b38d52e4d9...wk8:wk8/generic_deallocator#diff-75f4f75eee6a6a7a7268c672203ea0ac 32 type Deallocator struct { 33 store *store.MemoryStore 34 35 // for services that are shutting down, we keep track of how many 36 // tasks still exist for them 37 services map[string]*serviceWithTaskCounts 38 39 // mainly used for tests, so that we can peek 40 // into the DB state in between events 41 // the bool notifies whether any DB update was actually performed 42 eventChan chan bool 43 44 stopChan chan struct{} 45 doneChan chan struct{} 46 } 47 48 // used in our internal state's `services` right above 49 type serviceWithTaskCounts struct { 50 service *api.Service 51 taskCount int 52 } 53 54 // New creates a new deallocator 55 func New(store *store.MemoryStore) *Deallocator { 56 return &Deallocator{ 57 store: store, 58 services: make(map[string]*serviceWithTaskCounts), 59 60 stopChan: make(chan struct{}), 61 doneChan: make(chan struct{}), 62 } 63 } 64 65 // Run starts the deallocator, which then starts cleaning up services 66 // and their resources when relevant (ie when no tasks still exist 67 // for a given service) 68 // This is a blocking function 69 func (deallocator *Deallocator) Run(ctx context.Context) error { 70 var ( 71 allServices []*api.Service 72 allNetworks []*api.Network 73 ) 74 75 eventsChan, _, err := store.ViewAndWatch(deallocator.store, 76 func(readTx store.ReadTx) (err error) { 77 // look for services that are marked for deletion 78 // there's no index on the `PendingDelete` field in the store, 79 // so we just iterate over all of them and filter manually 80 // this is okay since we only do this at leadership change 81 allServices, err = store.FindServices(readTx, store.All) 82 83 if err != nil { 84 log.G(ctx).WithError(err).Error("failed to list services in deallocator init") 85 return err 86 } 87 88 // now we also need to look at all existing service-level networks 89 // that may be marked for deletion 90 if allNetworks, err = store.FindNetworks(readTx, store.All); err != nil { 91 log.G(ctx).WithError(err).Error("failed to list networks in deallocator init") 92 return err 93 } 94 95 return 96 }, 97 api.EventDeleteTask{}, 98 api.EventUpdateService{}, 99 api.EventUpdateNetwork{}) 100 101 if err != nil { 102 // if we have an error here, we can't proceed any further 103 log.G(ctx).WithError(err).Error("failed to initialize the deallocator") 104 return err 105 } 106 107 defer func() { 108 // eventsChanCancel() 109 close(deallocator.doneChan) 110 }() 111 112 anyUpdated := false 113 // now let's populate our internal taskCounts 114 for _, service := range allServices { 115 if updated, _ := deallocator.processService(ctx, service); updated { 116 anyUpdated = true 117 } 118 } 119 120 // and deallocate networks that may be marked for deletion and aren't used any more 121 for _, network := range allNetworks { 122 if updated, _ := deallocator.processNetwork(ctx, nil, network, nil); updated { 123 anyUpdated = true 124 } 125 } 126 127 // now we just need to wait for events 128 deallocator.notifyEventChan(anyUpdated) 129 for { 130 select { 131 case event := <-eventsChan: 132 if updated, err := deallocator.processNewEvent(ctx, event); err == nil { 133 deallocator.notifyEventChan(updated) 134 } else { 135 log.G(ctx).WithError(err).Errorf("error processing deallocator event %#v", event) 136 } 137 case <-deallocator.stopChan: 138 return nil 139 case <-ctx.Done(): 140 return ctx.Err() 141 } 142 } 143 } 144 145 // Stop stops the deallocator's routine 146 // FIXME (jrouge): see the comment on TaskReaper.Stop() and see when to properly stop this 147 // plus unit test on this! 148 func (deallocator *Deallocator) Stop() { 149 close(deallocator.stopChan) 150 <-deallocator.doneChan 151 } 152 153 // always a bno-op, except when running tests tests 154 // see the comment about `Deallocator`s' `eventChan` field 155 func (deallocator *Deallocator) notifyEventChan(updated bool) { 156 if deallocator.eventChan != nil { 157 deallocator.eventChan <- updated 158 } 159 } 160 161 // if a service is marked for deletion, this checks whether it's ready to be 162 // deleted yet, and does it if relevant 163 func (deallocator *Deallocator) processService(ctx context.Context, service *api.Service) (bool, error) { 164 if !service.PendingDelete { 165 return false, nil 166 } 167 168 var ( 169 tasks []*api.Task 170 err error 171 ) 172 173 deallocator.store.View(func(tx store.ReadTx) { 174 tasks, err = store.FindTasks(tx, store.ByServiceID(service.ID)) 175 }) 176 177 if err != nil { 178 log.G(ctx).WithError(err).Errorf("failed to retrieve the list of tasks for service %v", service.ID) 179 // if in doubt, let's proceed to clean up the service anyway 180 // better to clean up resources that shouldn't be cleaned up yet 181 // than ending up with a service and some resources lost in limbo forever 182 return true, deallocator.deallocateService(ctx, service) 183 } else if len(tasks) == 0 { 184 // no tasks remaining for this service, we can clean it up 185 return true, deallocator.deallocateService(ctx, service) 186 } 187 deallocator.services[service.ID] = &serviceWithTaskCounts{service: service, taskCount: len(tasks)} 188 return false, nil 189 } 190 191 func (deallocator *Deallocator) deallocateService(ctx context.Context, service *api.Service) (err error) { 192 err = deallocator.store.Update(func(tx store.Tx) error { 193 // first, let's delete the service 194 var ignoreServiceID *string 195 if err := store.DeleteService(tx, service.ID); err != nil { 196 // all errors are just for logging here, we do a best effort at cleaning up everything we can 197 log.G(ctx).WithError(err).Errorf("failed to delete service record ID %v", service.ID) 198 ignoreServiceID = &service.ID 199 } 200 201 // then all of its networks, provided no other service uses them 202 spec := service.Spec 203 // see https://github.com/docker/swarmkit/blob/e2aafdd3453d2ab103dd97364f79ea6b857f9446/api/specs.proto#L80-L84 204 // we really should have a helper function on services to do this... 205 networkConfigs := spec.Task.Networks 206 if len(networkConfigs) == 0 { 207 networkConfigs = spec.Networks 208 } 209 for _, networkConfig := range networkConfigs { 210 if network := store.GetNetwork(tx, networkConfig.Target); network != nil { 211 deallocator.processNetwork(ctx, tx, network, ignoreServiceID) 212 } 213 } 214 215 return nil 216 }) 217 218 if err != nil { 219 log.G(ctx).WithError(err).Errorf("DB error when deallocating service %v", service.ID) 220 } 221 return 222 } 223 224 // proceeds to deallocate a network if it's pending deletion and there no 225 // longer are any services using it 226 // actually deletes the network if it's marked for deletion and no services are 227 // using it any more (or the only one using it has ID `ignoreServiceID`, if not 228 // nil - this comes in handy when there's been an error deleting a service) 229 // This function can be called either when deallocating a whole service, or 230 // because there was an `EventUpdateNetwork` event - in the former case, the 231 // transaction will be that of the service deallocation, in the latter it will be nil 232 func (deallocator *Deallocator) processNetwork(ctx context.Context, tx store.Tx, network *api.Network, ignoreServiceID *string) (updated bool, err error) { 233 if !network.PendingDelete { 234 return 235 } 236 237 updateFunc := func(t store.Tx) error { 238 services, err := store.FindServices(t, store.ByReferencedNetworkID(network.ID)) 239 240 if err != nil { 241 log.G(ctx).WithError(err).Errorf("could not fetch services using network ID %v", network.ID) 242 return err 243 } 244 245 noMoreServices := len(services) == 0 || 246 len(services) == 1 && ignoreServiceID != nil && services[0].ID == *ignoreServiceID 247 248 if noMoreServices { 249 return store.DeleteNetwork(t, network.ID) 250 } 251 return nil 252 } 253 254 if tx == nil { 255 err = deallocator.store.Update(updateFunc) 256 } else { 257 err = updateFunc(tx) 258 } 259 260 if err != nil { 261 log.G(ctx).WithError(err).Errorf("DB error when deallocating network ID %v", network.ID) 262 } 263 return 264 } 265 266 // Processes new events, and dispatches to the right method depending on what 267 // type of event it is. 268 // The boolean part of the return tuple indicates whether anything was actually 269 // removed from the store 270 func (deallocator *Deallocator) processNewEvent(ctx context.Context, event events.Event) (bool, error) { 271 switch typedEvent := event.(type) { 272 case api.EventDeleteTask: 273 serviceID := typedEvent.Task.ServiceID 274 275 if serviceWithCount, present := deallocator.services[serviceID]; present { 276 if serviceWithCount.taskCount <= 1 { 277 delete(deallocator.services, serviceID) 278 return deallocator.processService(ctx, serviceWithCount.service) 279 } 280 serviceWithCount.taskCount-- 281 } 282 283 return false, nil 284 case api.EventUpdateService: 285 return deallocator.processService(ctx, typedEvent.Service) 286 case api.EventUpdateNetwork: 287 return deallocator.processNetwork(ctx, nil, typedEvent.Network, nil) 288 default: 289 return false, nil 290 } 291 }