github.com/kaisenlinux/docker.io@v0.0.0-20230510090727-ea55db55fac7/swarmkit/manager/role_manager.go (about)

     1  package manager
     2  
     3  import (
     4  	"context"
     5  	"time"
     6  
     7  	"code.cloudfoundry.org/clock"
     8  	"github.com/docker/swarmkit/api"
     9  	"github.com/docker/swarmkit/log"
    10  	"github.com/docker/swarmkit/manager/state/raft"
    11  	"github.com/docker/swarmkit/manager/state/raft/membership"
    12  	"github.com/docker/swarmkit/manager/state/store"
    13  )
    14  
const (
	// roleReconcileInterval is how often to retry a role reconciliation or a
	// raft member removal that failed on a previous attempt.
	roleReconcileInterval = 5 * time.Second

	// removalTimeout is how long to wait for a raft member removal to be
	// applied to the store before giving up on that attempt.
	removalTimeout = 5 * time.Second
)
    24  
// roleManager reconciles the raft member list with desired role changes.
// It watches node updates and deletions, promotes or demotes nodes in the
// store, and evicts deleted nodes from the raft membership.
type roleManager struct {
	// ctx is the manager's internal lifecycle context; cancel (invoked by
	// Stop) terminates the Run loop.
	ctx    context.Context
	cancel func()

	// store holds cluster state; raft is the local raft node used to query
	// and mutate the raft membership.
	store *store.MemoryStore
	raft  *raft.Node
	// doneChan is closed when Run returns, letting Stop wait for shutdown.
	doneChan chan struct{}

	// pendingReconciliation contains changed nodes that have not yet been reconciled in
	// the raft member list.
	pendingReconciliation map[string]*api.Node

	// pendingRemoval contains the IDs of nodes that have been deleted - if these correspond
	// to members in the raft cluster, those members need to be removed from raft
	pendingRemoval map[string]struct{}

	// clocksource is left nil except for tests which need to inject a fake
	// time source; getTicker falls back to the real clock when it is nil.
	clocksource clock.Clock
}
    45  
    46  // newRoleManager creates a new roleManager.
    47  func newRoleManager(store *store.MemoryStore, raftNode *raft.Node) *roleManager {
    48  	ctx, cancel := context.WithCancel(context.Background())
    49  	return &roleManager{
    50  		ctx:                   ctx,
    51  		cancel:                cancel,
    52  		store:                 store,
    53  		raft:                  raftNode,
    54  		doneChan:              make(chan struct{}),
    55  		pendingReconciliation: make(map[string]*api.Node),
    56  		pendingRemoval:        make(map[string]struct{}),
    57  	}
    58  }
    59  
    60  // getTicker returns a ticker based on the configured clock source
    61  func (rm *roleManager) getTicker(interval time.Duration) clock.Ticker {
    62  	if rm.clocksource == nil {
    63  		return clock.NewClock().NewTicker(interval)
    64  	}
    65  	return rm.clocksource.NewTicker(interval)
    66  
    67  }
    68  
// Run is roleManager's main loop.  On startup, it looks at every node object in the cluster and
// attempts to reconcile the raft member list with all the nodes' desired roles.  If any nodes
// need to be demoted or promoted, it will add them to a reconciliation queue, and if any raft
// members' node have been deleted, it will add them to a removal queue.
//
// These queues are processed immediately, and any nodes that failed to be processed are
// processed again in the next reconciliation interval, so that nodes will hopefully eventually
// be reconciled.  As node updates come in, any promotions or demotions are also added to the
// reconciliation queue and reconciled.  As node removals come in, they are added to the removal
// queue to be removed from the raft cluster.
//
// Removal from a raft cluster is idempotent (and it's the only raft cluster change that will occur
// during reconciliation or removal), so it's fine if a node is in both the removal and reconciliation
// queues.
//
// The ctx param is only used for logging.
func (rm *roleManager) Run(ctx context.Context) {
	defer close(rm.doneChan)

	var (
		nodes []*api.Node

		// ticker and tickerCh are used to time the reconciliation interval, which will
		// periodically attempt to re-reconcile nodes that failed to reconcile the first
		// time through
		ticker   clock.Ticker
		tickerCh <-chan time.Time
	)

	// Snapshot the current node list and subscribe to node update/delete
	// events in a single step, so no event can slip in between the snapshot
	// and the watch.
	watcher, cancelWatch, err := store.ViewAndWatch(rm.store,
		func(readTx store.ReadTx) error {
			var err error
			nodes, err = store.FindNodes(readTx, store.All)
			return err
		},
		api.EventUpdateNode{},
		api.EventDeleteNode{})
	defer cancelWatch()

	if err != nil {
		log.G(ctx).WithError(err).Error("failed to check nodes for role changes")
	} else {
		// Assume all raft members have been deleted from the cluster, until the node list
		// tells us otherwise.  We can make this assumption because the node object must
		// exist first before the raft member object.
		//
		// Background life-cycle for a manager: it joins the cluster, getting a new TLS
		// certificate. To get a TLS certificate, it makes an RPC call to the CA server,
		// which on successful join adds its information to the cluster node list and
		// eventually generates a TLS certificate for it. Once it has a TLS certificate,
		// it can contact the other nodes, and makes an RPC call to request to join the
		// raft cluster.  The node it contacts will add the node to the raft membership.
		for _, member := range rm.raft.GetMemberlist() {
			rm.pendingRemoval[member.NodeID] = struct{}{}
		}
		for _, node := range nodes {
			// if the node exists, we don't want it removed from the raft membership cluster
			// necessarily
			delete(rm.pendingRemoval, node.ID)

			// reconcile each existing node
			rm.pendingReconciliation[node.ID] = node
			rm.reconcileRole(ctx, node)
		}
		for nodeID := range rm.pendingRemoval {
			rm.evictRemovedNode(ctx, nodeID)
		}
		// If any reconciliations or member removals failed, we want to try again, so
		// make sure that we start the ticker so we can try again and again every
		// roleReconciliationInterval seconds until the queues are both empty.
		if len(rm.pendingReconciliation) != 0 || len(rm.pendingRemoval) != 0 {
			ticker = rm.getTicker(roleReconcileInterval)
			tickerCh = ticker.C()
		}
	}

	for {
		select {
		case event := <-watcher:
			switch ev := event.(type) {
			case api.EventUpdateNode:
				rm.pendingReconciliation[ev.Node.ID] = ev.Node
				rm.reconcileRole(ctx, ev.Node)
			case api.EventDeleteNode:
				rm.pendingRemoval[ev.Node.ID] = struct{}{}
				rm.evictRemovedNode(ctx, ev.Node.ID)
			}
			// If any reconciliations or member removals failed, we want to try again, so
			// make sure that we start the ticker so we can try again and again every
			// roleReconciliationInterval seconds until the queues are both empty.
			if (len(rm.pendingReconciliation) != 0 || len(rm.pendingRemoval) != 0) && ticker == nil {
				ticker = rm.getTicker(roleReconcileInterval)
				tickerCh = ticker.C()
			}
		case <-tickerCh:
			// Retry everything still pending; items that succeed remove
			// themselves from the queues inside these calls.
			for _, node := range rm.pendingReconciliation {
				rm.reconcileRole(ctx, node)
			}
			for nodeID := range rm.pendingRemoval {
				rm.evictRemovedNode(ctx, nodeID)
			}
			if len(rm.pendingReconciliation) == 0 && len(rm.pendingRemoval) == 0 {
				// Both queues drained; stop ticking until a new event fails
				// and restarts the ticker above.
				ticker.Stop()
				ticker = nil
				tickerCh = nil
			}
		case <-rm.ctx.Done():
			// Stop was called; release the ticker before exiting.
			if ticker != nil {
				ticker.Stop()
			}
			return
		}
	}
}
   183  
   184  // evictRemovedNode evicts a removed node from the raft cluster membership.  This is to cover an edge case in which
   185  // a node might have been removed, but somehow the role was not reconciled (possibly a demotion and a removal happened
   186  // in rapid succession before the raft membership configuration went through).
   187  func (rm *roleManager) evictRemovedNode(ctx context.Context, nodeID string) {
   188  	// Check if the member still exists in the membership
   189  	member := rm.raft.GetMemberByNodeID(nodeID)
   190  	if member != nil {
   191  		// We first try to remove the raft node from the raft cluster.  On the next tick, if the node
   192  		// has been removed from the cluster membership, we then delete it from the removed list
   193  		rm.removeMember(ctx, member)
   194  		return
   195  	}
   196  	delete(rm.pendingRemoval, nodeID)
   197  }
   198  
// removeMember removes a member from the raft cluster membership.  If the
// member is this node itself, it first tries to transfer leadership away,
// since we expect to lose leadership during our own removal.
func (rm *roleManager) removeMember(ctx context.Context, member *membership.Member) {
	// Quorum safeguard - quorum should have been checked before a node was allowed to be demoted, but if in the
	// intervening time some other node disconnected, removing this node would result in a loss of cluster quorum.
	// We leave the member in place for now; the caller keeps it in its pending
	// queue, so removal is retried on a later reconciliation tick.
	if !rm.raft.CanRemoveMember(member.RaftID) {
		// TODO(aaronl): Retry later
		log.G(ctx).Debugf("can't demote node %s at this time: removing member from raft would result in a loss of quorum", member.NodeID)
		return
	}

	// Bound how long we wait for the removal to be applied.
	rmCtx, rmCancel := context.WithTimeout(rm.ctx, removalTimeout)
	defer rmCancel()

	if member.RaftID == rm.raft.Config.ID {
		// Don't use rmCtx, because we expect to lose
		// leadership, which will cancel this context.
		log.G(ctx).Info("demoted; transferring leadership")
		err := rm.raft.TransferLeadership(context.Background())
		if err == nil {
			return
		}
		// Transfer failed; fall through and attempt direct removal below.
		log.G(ctx).WithError(err).Info("failed to transfer leadership")
	}
	if err := rm.raft.RemoveMember(rmCtx, member.RaftID); err != nil {
		// TODO(aaronl): Retry later
		log.G(ctx).WithError(err).Debugf("can't demote node %s at this time", member.NodeID)
	}
}
   228  
   229  // reconcileRole looks at the desired role for a node, and if it is being demoted or promoted, updates the
   230  // node role accordingly.   If the node is being demoted, it also removes the node from the raft cluster membership.
   231  func (rm *roleManager) reconcileRole(ctx context.Context, node *api.Node) {
   232  	if node.Role == node.Spec.DesiredRole {
   233  		// Nothing to do.
   234  		delete(rm.pendingReconciliation, node.ID)
   235  		return
   236  	}
   237  
   238  	// Promotion can proceed right away.
   239  	if node.Spec.DesiredRole == api.NodeRoleManager && node.Role == api.NodeRoleWorker {
   240  		err := rm.store.Update(func(tx store.Tx) error {
   241  			updatedNode := store.GetNode(tx, node.ID)
   242  			if updatedNode == nil || updatedNode.Spec.DesiredRole != node.Spec.DesiredRole || updatedNode.Role != node.Role {
   243  				return nil
   244  			}
   245  			updatedNode.Role = api.NodeRoleManager
   246  			return store.UpdateNode(tx, updatedNode)
   247  		})
   248  		if err != nil {
   249  			log.G(ctx).WithError(err).Errorf("failed to promote node %s", node.ID)
   250  		} else {
   251  			delete(rm.pendingReconciliation, node.ID)
   252  		}
   253  	} else if node.Spec.DesiredRole == api.NodeRoleWorker && node.Role == api.NodeRoleManager {
   254  		// Check for node in memberlist
   255  		member := rm.raft.GetMemberByNodeID(node.ID)
   256  		if member != nil {
   257  			// We first try to remove the raft node from the raft cluster.  On the next tick, if the node
   258  			// has been removed from the cluster membership, we then update the store to reflect the fact
   259  			// that it has been successfully demoted, and if that works, remove it from the pending list.
   260  			rm.removeMember(ctx, member)
   261  			return
   262  		}
   263  
   264  		err := rm.store.Update(func(tx store.Tx) error {
   265  			updatedNode := store.GetNode(tx, node.ID)
   266  			if updatedNode == nil || updatedNode.Spec.DesiredRole != node.Spec.DesiredRole || updatedNode.Role != node.Role {
   267  				return nil
   268  			}
   269  			updatedNode.Role = api.NodeRoleWorker
   270  
   271  			return store.UpdateNode(tx, updatedNode)
   272  		})
   273  		if err != nil {
   274  			log.G(ctx).WithError(err).Errorf("failed to demote node %s", node.ID)
   275  		} else {
   276  			delete(rm.pendingReconciliation, node.ID)
   277  		}
   278  	}
   279  }
   280  
// Stop stops the roleManager and waits for the main loop to exit.
// It cancels the internal context (which makes Run return via rm.ctx.Done())
// and then blocks until Run closes doneChan.
func (rm *roleManager) Stop() {
	rm.cancel()
	<-rm.doneChan
}