github.com/kaisenlinux/docker.io@v0.0.0-20230510090727-ea55db55fac7/swarmkit/ca/reconciler.go (about)

     1  package ca
     2  
     3  import (
     4  	"bytes"
     5  	"context"
     6  	"fmt"
     7  	"reflect"
     8  	"sync"
     9  	"time"
    10  
    11  	"github.com/cloudflare/cfssl/helpers"
    12  	"github.com/docker/swarmkit/api"
    13  	"github.com/docker/swarmkit/api/equality"
    14  	"github.com/docker/swarmkit/log"
    15  	"github.com/docker/swarmkit/manager/state/store"
    16  	"github.com/pkg/errors"
    17  )
    18  
    19  // IssuanceStateRotateMaxBatchSize is the maximum number of nodes we'll tell to rotate their certificates in any given update
    20  const IssuanceStateRotateMaxBatchSize = 30
    21  
    22  func hasIssuer(n *api.Node, info *IssuerInfo) bool {
    23  	if n.Description == nil || n.Description.TLSInfo == nil {
    24  		return false
    25  	}
    26  	return bytes.Equal(info.Subject, n.Description.TLSInfo.CertIssuerSubject) && bytes.Equal(info.PublicKey, n.Description.TLSInfo.CertIssuerPublicKey)
    27  }
    28  
    29  var errRootRotationChanged = errors.New("target root rotation has changed")
    30  
    31  // rootRotationReconciler keeps track of all the nodes in the store so that we can determine which ones need reconciliation when nodes are updated
    32  // or the root CA is updated.  This is meant to be used with watches on nodes and the cluster, and provides functions to be called when the
    33  // cluster's RootCA has changed and when a node is added, updated, or removed.
    34  type rootRotationReconciler struct {
    35  	mu                  sync.Mutex
    36  	clusterID           string
    37  	batchUpdateInterval time.Duration
    38  	ctx                 context.Context
    39  	store               *store.MemoryStore
    40  
    41  	currentRootCA    *api.RootCA
    42  	currentIssuer    IssuerInfo
    43  	unconvergedNodes map[string]*api.Node
    44  
    45  	wg     sync.WaitGroup
    46  	cancel func()
    47  }
    48  
    49  // IssuerFromAPIRootCA returns the desired issuer given an API root CA object
    50  func IssuerFromAPIRootCA(rootCA *api.RootCA) (*IssuerInfo, error) {
    51  	wantedIssuer := rootCA.CACert
    52  	if rootCA.RootRotation != nil {
    53  		wantedIssuer = rootCA.RootRotation.CACert
    54  	}
    55  	issuerCerts, err := helpers.ParseCertificatesPEM(wantedIssuer)
    56  	if err != nil {
    57  		return nil, errors.Wrap(err, "invalid certificate in cluster root CA object")
    58  	}
    59  	if len(issuerCerts) == 0 {
    60  		return nil, errors.New("invalid certificate in cluster root CA object")
    61  	}
    62  	return &IssuerInfo{
    63  		Subject:   issuerCerts[0].RawSubject,
    64  		PublicKey: issuerCerts[0].RawSubjectPublicKeyInfo,
    65  	}, nil
    66  }
    67  
    68  // assumption:  UpdateRootCA will never be called with a `nil` root CA because the caller will be acting in response to
    69  // a store update event
    70  func (r *rootRotationReconciler) UpdateRootCA(newRootCA *api.RootCA) {
    71  	issuerInfo, err := IssuerFromAPIRootCA(newRootCA)
    72  	if err != nil {
    73  		log.G(r.ctx).WithError(err).Error("unable to update process the current root CA")
    74  		return
    75  	}
    76  
    77  	var (
    78  		shouldStartNewLoop, waitForPrevLoop bool
    79  		loopCtx                             context.Context
    80  	)
    81  	r.mu.Lock()
    82  	defer func() {
    83  		r.mu.Unlock()
    84  		if shouldStartNewLoop {
    85  			if waitForPrevLoop {
    86  				r.wg.Wait()
    87  			}
    88  			r.wg.Add(1)
    89  			go r.runReconcilerLoop(loopCtx, newRootCA)
    90  		}
    91  	}()
    92  
    93  	// check if the issuer has changed, first
    94  	if reflect.DeepEqual(&r.currentIssuer, issuerInfo) {
    95  		r.currentRootCA = newRootCA
    96  		return
    97  	}
    98  	// If the issuer has changed, iterate through all the nodes to figure out which ones need rotation
    99  	if newRootCA.RootRotation != nil {
   100  		var nodes []*api.Node
   101  		r.store.View(func(tx store.ReadTx) {
   102  			nodes, err = store.FindNodes(tx, store.All)
   103  		})
   104  		if err != nil {
   105  			log.G(r.ctx).WithError(err).Error("unable to list nodes, so unable to process the current root CA")
   106  			return
   107  		}
   108  
   109  		// from here on out, there will be no more errors that cause us to have to abandon updating the Root CA,
   110  		// so we can start making changes to r's fields
   111  		r.unconvergedNodes = make(map[string]*api.Node)
   112  		for _, n := range nodes {
   113  			if !hasIssuer(n, issuerInfo) {
   114  				r.unconvergedNodes[n.ID] = n
   115  			}
   116  		}
   117  		shouldStartNewLoop = true
   118  		if r.cancel != nil { // there's already a loop going, so cancel it
   119  			r.cancel()
   120  			waitForPrevLoop = true
   121  		}
   122  		loopCtx, r.cancel = context.WithCancel(r.ctx)
   123  	} else {
   124  		r.unconvergedNodes = nil
   125  	}
   126  	r.currentRootCA = newRootCA
   127  	r.currentIssuer = *issuerInfo
   128  }
   129  
   130  // assumption:  UpdateNode will never be called with a `nil` node because the caller will be acting in response to
   131  // a store update event
   132  func (r *rootRotationReconciler) UpdateNode(node *api.Node) {
   133  	r.mu.Lock()
   134  	defer r.mu.Unlock()
   135  	// if we're not in the middle of a root rotation ignore the update
   136  	if r.currentRootCA == nil || r.currentRootCA.RootRotation == nil {
   137  		return
   138  	}
   139  	if hasIssuer(node, &r.currentIssuer) {
   140  		delete(r.unconvergedNodes, node.ID)
   141  	} else {
   142  		r.unconvergedNodes[node.ID] = node
   143  	}
   144  }
   145  
   146  // assumption:  DeleteNode will never be called with a `nil` node because the caller will be acting in response to
   147  // a store update event
   148  func (r *rootRotationReconciler) DeleteNode(node *api.Node) {
   149  	r.mu.Lock()
   150  	delete(r.unconvergedNodes, node.ID)
   151  	r.mu.Unlock()
   152  }
   153  
   154  func (r *rootRotationReconciler) runReconcilerLoop(ctx context.Context, loopRootCA *api.RootCA) {
   155  	defer r.wg.Done()
   156  	for {
   157  		r.mu.Lock()
   158  		if len(r.unconvergedNodes) == 0 {
   159  			r.mu.Unlock()
   160  
   161  			err := r.store.Update(func(tx store.Tx) error {
   162  				return r.finishRootRotation(tx, loopRootCA)
   163  			})
   164  			if err == nil {
   165  				log.G(r.ctx).Info("completed root rotation")
   166  				return
   167  			}
   168  			log.G(r.ctx).WithError(err).Error("could not complete root rotation")
   169  			if err == errRootRotationChanged {
   170  				// if the root rotation has changed, this loop will be cancelled anyway, so may as well abort early
   171  				return
   172  			}
   173  		} else {
   174  			var toUpdate []*api.Node
   175  			for _, n := range r.unconvergedNodes {
   176  				iState := n.Certificate.Status.State
   177  				if iState != api.IssuanceStateRenew && iState != api.IssuanceStatePending && iState != api.IssuanceStateRotate {
   178  					n = n.Copy()
   179  					n.Certificate.Status.State = api.IssuanceStateRotate
   180  					toUpdate = append(toUpdate, n)
   181  					if len(toUpdate) >= IssuanceStateRotateMaxBatchSize {
   182  						break
   183  					}
   184  				}
   185  			}
   186  			r.mu.Unlock()
   187  
   188  			if err := r.batchUpdateNodes(toUpdate); err != nil {
   189  				log.G(r.ctx).WithError(err).Errorf("store error when trying to batch update %d nodes to request certificate rotation", len(toUpdate))
   190  			}
   191  		}
   192  
   193  		select {
   194  		case <-ctx.Done():
   195  			return
   196  		case <-time.After(r.batchUpdateInterval):
   197  		}
   198  	}
   199  }
   200  
   201  // This function assumes that the expected root CA has root rotation.  This is intended to be used by
   202  // `reconcileNodeRootsAndCerts`, which uses the root CA from the `lastSeenClusterRootCA`, and checks
   203  // that it has a root rotation before calling this function.
   204  func (r *rootRotationReconciler) finishRootRotation(tx store.Tx, expectedRootCA *api.RootCA) error {
   205  	cluster := store.GetCluster(tx, r.clusterID)
   206  	if cluster == nil {
   207  		return fmt.Errorf("unable to get cluster %s", r.clusterID)
   208  	}
   209  
   210  	// If the RootCA object has changed (because another root rotation was started or because some other node
   211  	// had finished the root rotation), we cannot finish the root rotation that we were working on.
   212  	if !equality.RootCAEqualStable(expectedRootCA, &cluster.RootCA) {
   213  		return errRootRotationChanged
   214  	}
   215  
   216  	var signerCert []byte
   217  	if len(cluster.RootCA.RootRotation.CAKey) > 0 {
   218  		signerCert = cluster.RootCA.RootRotation.CACert
   219  	}
   220  	// we don't actually have to parse out the default node expiration from the cluster - we are just using
   221  	// the ca.RootCA object to generate new tokens and the digest
   222  	updatedRootCA, err := NewRootCA(cluster.RootCA.RootRotation.CACert, signerCert, cluster.RootCA.RootRotation.CAKey,
   223  		DefaultNodeCertExpiration, nil)
   224  	if err != nil {
   225  		return errors.Wrap(err, "invalid cluster root rotation object")
   226  	}
   227  	cluster.RootCA = api.RootCA{
   228  		CACert:     cluster.RootCA.RootRotation.CACert,
   229  		CAKey:      cluster.RootCA.RootRotation.CAKey,
   230  		CACertHash: updatedRootCA.Digest.String(),
   231  		JoinTokens: api.JoinTokens{
   232  			Worker:  GenerateJoinToken(&updatedRootCA, cluster.FIPS),
   233  			Manager: GenerateJoinToken(&updatedRootCA, cluster.FIPS),
   234  		},
   235  		LastForcedRotation: cluster.RootCA.LastForcedRotation,
   236  	}
   237  	return store.UpdateCluster(tx, cluster)
   238  }
   239  
   240  func (r *rootRotationReconciler) batchUpdateNodes(toUpdate []*api.Node) error {
   241  	if len(toUpdate) == 0 {
   242  		return nil
   243  	}
   244  	err := r.store.Batch(func(batch *store.Batch) error {
   245  		// Directly update the nodes rather than get + update, and ignore version errors.  Since
   246  		// `rootRotationReconciler` should be hooked up to all node update/delete/create events, we should have
   247  		// close to the latest versions of all the nodes.  If not, the node will updated later and the
   248  		// next batch of updates should catch it.
   249  		for _, n := range toUpdate {
   250  			if err := batch.Update(func(tx store.Tx) error {
   251  				return store.UpdateNode(tx, n)
   252  			}); err != nil && err != store.ErrSequenceConflict {
   253  				log.G(r.ctx).WithError(err).Errorf("unable to update node %s to request a certificate rotation", n.ID)
   254  			}
   255  		}
   256  		return nil
   257  	})
   258  	return err
   259  }