github.com/kaisenlinux/docker.io@v0.0.0-20230510090727-ea55db55fac7/swarmkit/manager/deallocator/deallocator.go (about)

     1  package deallocator
     2  
     3  import (
     4  	"context"
     5  
     6  	"github.com/docker/go-events"
     7  	"github.com/docker/swarmkit/api"
     8  	"github.com/docker/swarmkit/log"
     9  	"github.com/docker/swarmkit/manager/state/store"
    10  )
    11  
// Deallocator waits for services to fully shutdown (ie no containers left)
// and then proceeds to deallocate service-level resources (e.g. networks),
// and finally services themselves
// in particular, the Deallocator should be the only place where services, or
// service-level resources, are ever deleted!
//
// It’s worth noting that this new component’s role is quite different from
// the task reaper’s: tasks are purely internal to Swarmkit, and their status
// is entirely managed by the system itself. In contrast, the deallocator is
// responsible for safely deleting entities that are directly controlled by the
// user.
//
// NOTE: since networks are the only service-level resources as of now,
// it has been deemed over-engineered to have a generic way to
// handle other types of service-level resources; if we ever start
// having more of those and thus want to reconsider this choice, it
// might be worth having a look at this archived branch, that does
// implement a way of separating the code for the deallocator itself
// from each resource-specific way of handling it
// https://github.com/docker/swarmkit/compare/a84c01f49091167dd086c26b45dc18b38d52e4d9...wk8:wk8/generic_deallocator#diff-75f4f75eee6a6a7a7268c672203ea0ac
type Deallocator struct {
	// the memory store this deallocator reads from and deletes in
	store *store.MemoryStore

	// for services that are shutting down, we keep track of how many
	// tasks still exist for them
	services map[string]*serviceWithTaskCounts

	// mainly used for tests, so that we can peek
	// into the DB state in between events
	// the bool notifies whether any DB update was actually performed
	eventChan chan bool

	// stopChan is closed by Stop() to ask Run() to return;
	// doneChan is closed by Run() just before it returns, so that
	// Stop() knows when shutdown has completed
	stopChan chan struct{}
	doneChan chan struct{}
}
    47  
// serviceWithTaskCounts is the value type of the Deallocator's `services`
// map right above: it pairs a service pending deletion with the number of
// its tasks still present in the store
type serviceWithTaskCounts struct {
	service   *api.Service
	taskCount int
}
    53  
    54  // New creates a new deallocator
    55  func New(store *store.MemoryStore) *Deallocator {
    56  	return &Deallocator{
    57  		store:    store,
    58  		services: make(map[string]*serviceWithTaskCounts),
    59  
    60  		stopChan: make(chan struct{}),
    61  		doneChan: make(chan struct{}),
    62  	}
    63  }
    64  
    65  // Run starts the deallocator, which then starts cleaning up services
    66  // and their resources when relevant (ie when no tasks still exist
    67  // for a given service)
    68  // This is a blocking function
    69  func (deallocator *Deallocator) Run(ctx context.Context) error {
    70  	var (
    71  		allServices []*api.Service
    72  		allNetworks []*api.Network
    73  	)
    74  
    75  	eventsChan, _, err := store.ViewAndWatch(deallocator.store,
    76  		func(readTx store.ReadTx) (err error) {
    77  			// look for services that are marked for deletion
    78  			// there's no index on the `PendingDelete` field in the store,
    79  			// so we just iterate over all of them and filter manually
    80  			// this is okay since we only do this at leadership change
    81  			allServices, err = store.FindServices(readTx, store.All)
    82  
    83  			if err != nil {
    84  				log.G(ctx).WithError(err).Error("failed to list services in deallocator init")
    85  				return err
    86  			}
    87  
    88  			// now we also need to look at all existing service-level networks
    89  			// that may be marked for deletion
    90  			if allNetworks, err = store.FindNetworks(readTx, store.All); err != nil {
    91  				log.G(ctx).WithError(err).Error("failed to list networks in deallocator init")
    92  				return err
    93  			}
    94  
    95  			return
    96  		},
    97  		api.EventDeleteTask{},
    98  		api.EventUpdateService{},
    99  		api.EventUpdateNetwork{})
   100  
   101  	if err != nil {
   102  		// if we have an error here, we can't proceed any further
   103  		log.G(ctx).WithError(err).Error("failed to initialize the deallocator")
   104  		return err
   105  	}
   106  
   107  	defer func() {
   108  		// eventsChanCancel()
   109  		close(deallocator.doneChan)
   110  	}()
   111  
   112  	anyUpdated := false
   113  	// now let's populate our internal taskCounts
   114  	for _, service := range allServices {
   115  		if updated, _ := deallocator.processService(ctx, service); updated {
   116  			anyUpdated = true
   117  		}
   118  	}
   119  
   120  	// and deallocate networks that may be marked for deletion and aren't used any more
   121  	for _, network := range allNetworks {
   122  		if updated, _ := deallocator.processNetwork(ctx, nil, network, nil); updated {
   123  			anyUpdated = true
   124  		}
   125  	}
   126  
   127  	// now we just need to wait for events
   128  	deallocator.notifyEventChan(anyUpdated)
   129  	for {
   130  		select {
   131  		case event := <-eventsChan:
   132  			if updated, err := deallocator.processNewEvent(ctx, event); err == nil {
   133  				deallocator.notifyEventChan(updated)
   134  			} else {
   135  				log.G(ctx).WithError(err).Errorf("error processing deallocator event %#v", event)
   136  			}
   137  		case <-deallocator.stopChan:
   138  			return nil
   139  		case <-ctx.Done():
   140  			return ctx.Err()
   141  		}
   142  	}
   143  }
   144  
// Stop stops the deallocator's routine: closing stopChan asks Run to
// return, then the receive on doneChan blocks until Run has actually
// finished shutting down (the order of these two statements matters)
// FIXME (jrouge): see the comment on TaskReaper.Stop() and see when to properly stop this
// plus unit test on this!
func (deallocator *Deallocator) Stop() {
	close(deallocator.stopChan)
	<-deallocator.doneChan
}
   152  
// always a no-op, except when running tests, where eventChan is non-nil
// so that tests can block until an event has been fully processed;
// see the comment about `Deallocator`s' `eventChan` field
func (deallocator *Deallocator) notifyEventChan(updated bool) {
	if deallocator.eventChan != nil {
		deallocator.eventChan <- updated
	}
}
   160  
   161  // if a service is marked for deletion, this checks whether it's ready to be
   162  // deleted yet, and does it if relevant
   163  func (deallocator *Deallocator) processService(ctx context.Context, service *api.Service) (bool, error) {
   164  	if !service.PendingDelete {
   165  		return false, nil
   166  	}
   167  
   168  	var (
   169  		tasks []*api.Task
   170  		err   error
   171  	)
   172  
   173  	deallocator.store.View(func(tx store.ReadTx) {
   174  		tasks, err = store.FindTasks(tx, store.ByServiceID(service.ID))
   175  	})
   176  
   177  	if err != nil {
   178  		log.G(ctx).WithError(err).Errorf("failed to retrieve the list of tasks for service %v", service.ID)
   179  		// if in doubt, let's proceed to clean up the service anyway
   180  		// better to clean up resources that shouldn't be cleaned up yet
   181  		// than ending up with a service and some resources lost in limbo forever
   182  		return true, deallocator.deallocateService(ctx, service)
   183  	} else if len(tasks) == 0 {
   184  		// no tasks remaining for this service, we can clean it up
   185  		return true, deallocator.deallocateService(ctx, service)
   186  	}
   187  	deallocator.services[service.ID] = &serviceWithTaskCounts{service: service, taskCount: len(tasks)}
   188  	return false, nil
   189  }
   190  
   191  func (deallocator *Deallocator) deallocateService(ctx context.Context, service *api.Service) (err error) {
   192  	err = deallocator.store.Update(func(tx store.Tx) error {
   193  		// first, let's delete the service
   194  		var ignoreServiceID *string
   195  		if err := store.DeleteService(tx, service.ID); err != nil {
   196  			// all errors are just for logging here, we do a best effort at cleaning up everything we can
   197  			log.G(ctx).WithError(err).Errorf("failed to delete service record ID %v", service.ID)
   198  			ignoreServiceID = &service.ID
   199  		}
   200  
   201  		// then all of its networks, provided no other service uses them
   202  		spec := service.Spec
   203  		// see https://github.com/docker/swarmkit/blob/e2aafdd3453d2ab103dd97364f79ea6b857f9446/api/specs.proto#L80-L84
   204  		// we really should have a helper function on services to do this...
   205  		networkConfigs := spec.Task.Networks
   206  		if len(networkConfigs) == 0 {
   207  			networkConfigs = spec.Networks
   208  		}
   209  		for _, networkConfig := range networkConfigs {
   210  			if network := store.GetNetwork(tx, networkConfig.Target); network != nil {
   211  				deallocator.processNetwork(ctx, tx, network, ignoreServiceID)
   212  			}
   213  		}
   214  
   215  		return nil
   216  	})
   217  
   218  	if err != nil {
   219  		log.G(ctx).WithError(err).Errorf("DB error when deallocating service %v", service.ID)
   220  	}
   221  	return
   222  }
   223  
   224  // proceeds to deallocate a network if it's pending deletion and there no
   225  // longer are any services using it
   226  // actually deletes the network if it's marked for deletion and no services are
   227  // using it any more (or the only one using it has ID `ignoreServiceID`, if not
   228  // nil - this comes in handy when there's been an error deleting a service)
   229  // This function can be called either when deallocating a whole service, or
   230  // because there was an `EventUpdateNetwork` event - in the former case, the
   231  // transaction will be that of the service deallocation, in the latter it will be nil
   232  func (deallocator *Deallocator) processNetwork(ctx context.Context, tx store.Tx, network *api.Network, ignoreServiceID *string) (updated bool, err error) {
   233  	if !network.PendingDelete {
   234  		return
   235  	}
   236  
   237  	updateFunc := func(t store.Tx) error {
   238  		services, err := store.FindServices(t, store.ByReferencedNetworkID(network.ID))
   239  
   240  		if err != nil {
   241  			log.G(ctx).WithError(err).Errorf("could not fetch services using network ID %v", network.ID)
   242  			return err
   243  		}
   244  
   245  		noMoreServices := len(services) == 0 ||
   246  			len(services) == 1 && ignoreServiceID != nil && services[0].ID == *ignoreServiceID
   247  
   248  		if noMoreServices {
   249  			return store.DeleteNetwork(t, network.ID)
   250  		}
   251  		return nil
   252  	}
   253  
   254  	if tx == nil {
   255  		err = deallocator.store.Update(updateFunc)
   256  	} else {
   257  		err = updateFunc(tx)
   258  	}
   259  
   260  	if err != nil {
   261  		log.G(ctx).WithError(err).Errorf("DB error when deallocating network ID %v", network.ID)
   262  	}
   263  	return
   264  }
   265  
   266  // Processes new events, and dispatches to the right method depending on what
   267  // type of event it is.
   268  // The boolean part of the return tuple indicates whether anything was actually
   269  // removed from the store
   270  func (deallocator *Deallocator) processNewEvent(ctx context.Context, event events.Event) (bool, error) {
   271  	switch typedEvent := event.(type) {
   272  	case api.EventDeleteTask:
   273  		serviceID := typedEvent.Task.ServiceID
   274  
   275  		if serviceWithCount, present := deallocator.services[serviceID]; present {
   276  			if serviceWithCount.taskCount <= 1 {
   277  				delete(deallocator.services, serviceID)
   278  				return deallocator.processService(ctx, serviceWithCount.service)
   279  			}
   280  			serviceWithCount.taskCount--
   281  		}
   282  
   283  		return false, nil
   284  	case api.EventUpdateService:
   285  		return deallocator.processService(ctx, typedEvent.Service)
   286  	case api.EventUpdateNetwork:
   287  		return deallocator.processNetwork(ctx, nil, typedEvent.Network, nil)
   288  	default:
   289  		return false, nil
   290  	}
   291  }