github.com/kaisenlinux/docker.io@v0.0.0-20230510090727-ea55db55fac7/swarmkit/ca/server.go (about)

     1  package ca
     2  
     3  import (
     4  	"bytes"
     5  	"context"
     6  	"crypto/subtle"
     7  	"crypto/x509"
     8  	"sync"
     9  	"time"
    10  
    11  	"github.com/docker/swarmkit/api"
    12  	"github.com/docker/swarmkit/api/equality"
    13  	"github.com/docker/swarmkit/identity"
    14  	"github.com/docker/swarmkit/log"
    15  	"github.com/docker/swarmkit/manager/state/store"
    16  	gogotypes "github.com/gogo/protobuf/types"
    17  	"github.com/pkg/errors"
    18  	"github.com/sirupsen/logrus"
    19  	"google.golang.org/grpc/codes"
    20  	"google.golang.org/grpc/status"
    21  )
    22  
const (
	// defaultReconciliationRetryInterval is the value NewServer assigns to
	// Server.reconciliationRetryInterval, the period of the ticker on which
	// Run retries signing the nodes held in Server.pending.
	defaultReconciliationRetryInterval = 10 * time.Second
	// defaultRootReconciliationInterval is the value NewServer assigns to
	// Server.rootReconciliationRetryInterval, which Run hands to the root
	// rotation reconciler as its batchUpdateInterval.
	defaultRootReconciliationInterval  = 3 * time.Second
)
    27  
// Server is the CA and NodeCA API gRPC server.
// TODO(aaronl): At some point we may want to have separate implementations of
// CA, NodeCA, and other hypothetical future CA services. At the moment,
// breaking it apart doesn't seem worth it.
type Server struct {
	// mu is held by Run, Stop, Ready, UpdateRootCA, IssueNodeCertificate and
	// the is*Locked helpers while they read or write ctx, cancel, started and
	// joinTokens.
	mu                          sync.Mutex
	// wg is incremented by Run and waited on by Stop.
	wg                          sync.WaitGroup
	// ctx and cancel are created by Run; Stop invokes cancel.
	ctx                         context.Context
	cancel                      func()
	store                       *store.MemoryStore
	securityConfig              *SecurityConfig
	// clusterID is set by NewServer to the organization of the security
	// config's client TLS credentials.
	clusterID                   string
	localRootCA                 *RootCA
	externalCA                  *ExternalCA
	externalCAPool              *x509.CertPool
	// joinTokens is a copy of the cluster's join tokens, refreshed by
	// UpdateRootCA and reset to nil by Stop.
	joinTokens                  *api.JoinTokens
	// reconciliationRetryInterval is the period of the ticker on which Run
	// retries signing certificates for the nodes in pending.
	reconciliationRetryInterval time.Duration

	// pending is a map of nodes with pending certificates issuance or
	// renewal. They are indexed by node ID.
	pending map[string]*api.Node

	// started is a channel which gets closed once the server is running
	// and able to service RPCs. Stop replaces it with a fresh, open channel.
	started chan struct{}

	// these are cached values to ensure we only update the security config when
	// the cluster root CA and external CAs have changed - the cluster object
	// can change for other reasons, and it would not be necessary to update
	// the security config as a result
	lastSeenClusterRootCA *api.RootCA
	lastSeenExternalCAs   []*api.ExternalCA

	// This mutex protects the components of the CA server used to issue new certificates
	// (and any attributes used to update those components): `lastSeenClusterRootCA` and
	// `lastSeenExternalCAs`, which are used to update `externalCA`, `externalCAPool` and
	// `localRootCA`. It is held whenever those fields are read or written.
	signingMu sync.Mutex

	// lets us monitor and finish root rotations
	rootReconciler                  *rootRotationReconciler
	// rootReconciliationRetryInterval is passed by Run to the root rotation
	// reconciler as its batchUpdateInterval.
	rootReconciliationRetryInterval time.Duration
}
    71  
    72  // DefaultCAConfig returns the default CA Config, with a default expiration.
    73  func DefaultCAConfig() api.CAConfig {
    74  	return api.CAConfig{
    75  		NodeCertExpiry: gogotypes.DurationProto(DefaultNodeCertExpiration),
    76  	}
    77  }
    78  
    79  // NewServer creates a CA API server.
    80  func NewServer(store *store.MemoryStore, securityConfig *SecurityConfig) *Server {
    81  	return &Server{
    82  		store:                           store,
    83  		securityConfig:                  securityConfig,
    84  		localRootCA:                     securityConfig.RootCA(),
    85  		externalCA:                      NewExternalCA(nil, nil),
    86  		pending:                         make(map[string]*api.Node),
    87  		started:                         make(chan struct{}),
    88  		reconciliationRetryInterval:     defaultReconciliationRetryInterval,
    89  		rootReconciliationRetryInterval: defaultRootReconciliationInterval,
    90  		clusterID:                       securityConfig.ClientTLSCreds.Organization(),
    91  	}
    92  }
    93  
    94  // ExternalCA returns the current external CA - this is exposed to support unit testing only, and the external CA
    95  // should really be a private field
    96  func (s *Server) ExternalCA() *ExternalCA {
    97  	s.signingMu.Lock()
    98  	defer s.signingMu.Unlock()
    99  	return s.externalCA
   100  }
   101  
   102  // RootCA returns the current local root CA - this is exposed to support unit testing only, and the root CA
   103  // should really be a private field
   104  func (s *Server) RootCA() *RootCA {
   105  	s.signingMu.Lock()
   106  	defer s.signingMu.Unlock()
   107  	return s.localRootCA
   108  }
   109  
// SetReconciliationRetryInterval changes the time interval between
// reconciliation attempts. This function must be called before Run: the
// field is written here without taking any lock, and Run reads it once when
// it creates its retry ticker.
func (s *Server) SetReconciliationRetryInterval(reconciliationRetryInterval time.Duration) {
	s.reconciliationRetryInterval = reconciliationRetryInterval
}
   115  
// SetRootReconciliationInterval changes the time interval between root rotation
// reconciliation attempts. This function must be called before Run: the field
// is written here without taking any lock, and Run reads it once when it
// builds the root rotation reconciler.
func (s *Server) SetRootReconciliationInterval(interval time.Duration) {
	s.rootReconciliationRetryInterval = interval
}
   121  
   122  // GetUnlockKey is responsible for returning the current unlock key used for encrypting TLS private keys and
   123  // other at rest data.  Access to this RPC call should only be allowed via mutual TLS from managers.
   124  func (s *Server) GetUnlockKey(ctx context.Context, request *api.GetUnlockKeyRequest) (*api.GetUnlockKeyResponse, error) {
   125  	// This directly queries the store, rather than storing the unlock key and version on
   126  	// the `Server` object and updating it `updateCluster` is called, because we need this
   127  	// API to return the latest version of the key.  Otherwise, there might be a slight delay
   128  	// between when the cluster gets updated, and when this function returns the latest key.
   129  	// This delay is currently unacceptable because this RPC call is the only way, after a
   130  	// cluster update, to get the actual value of the unlock key, and we don't want to return
   131  	// a cached value.
   132  	resp := api.GetUnlockKeyResponse{}
   133  	s.store.View(func(tx store.ReadTx) {
   134  		cluster := store.GetCluster(tx, s.clusterID)
   135  		resp.Version = cluster.Meta.Version
   136  		if cluster.Spec.EncryptionConfig.AutoLockManagers {
   137  			for _, encryptionKey := range cluster.UnlockKeys {
   138  				if encryptionKey.Subsystem == ManagerRole {
   139  					resp.UnlockKey = encryptionKey.Key
   140  					return
   141  				}
   142  			}
   143  		}
   144  	})
   145  
   146  	return &resp, nil
   147  }
   148  
   149  // NodeCertificateStatus returns the current issuance status of an issuance request identified by the nodeID
   150  func (s *Server) NodeCertificateStatus(ctx context.Context, request *api.NodeCertificateStatusRequest) (*api.NodeCertificateStatusResponse, error) {
   151  	if request.NodeID == "" {
   152  		return nil, status.Errorf(codes.InvalidArgument, codes.InvalidArgument.String())
   153  	}
   154  
   155  	serverCtx, err := s.isRunningLocked()
   156  	if err != nil {
   157  		return nil, err
   158  	}
   159  
   160  	var node *api.Node
   161  
   162  	event := api.EventUpdateNode{
   163  		Node:   &api.Node{ID: request.NodeID},
   164  		Checks: []api.NodeCheckFunc{api.NodeCheckID},
   165  	}
   166  
   167  	// Retrieve the current value of the certificate with this token, and create a watcher
   168  	updates, cancel, err := store.ViewAndWatch(
   169  		s.store,
   170  		func(tx store.ReadTx) error {
   171  			node = store.GetNode(tx, request.NodeID)
   172  			return nil
   173  		},
   174  		event,
   175  	)
   176  	if err != nil {
   177  		return nil, err
   178  	}
   179  	defer cancel()
   180  
   181  	// This node ID doesn't exist
   182  	if node == nil {
   183  		return nil, status.Errorf(codes.NotFound, codes.NotFound.String())
   184  	}
   185  
   186  	log.G(ctx).WithFields(logrus.Fields{
   187  		"node.id": node.ID,
   188  		"status":  node.Certificate.Status,
   189  		"method":  "NodeCertificateStatus",
   190  	})
   191  
   192  	// If this certificate has a final state, return it immediately (both pending and renew are transition states)
   193  	if isFinalState(node.Certificate.Status) {
   194  		return &api.NodeCertificateStatusResponse{
   195  			Status:      &node.Certificate.Status,
   196  			Certificate: &node.Certificate,
   197  		}, nil
   198  	}
   199  
   200  	log.G(ctx).WithFields(logrus.Fields{
   201  		"node.id": node.ID,
   202  		"status":  node.Certificate.Status,
   203  		"method":  "NodeCertificateStatus",
   204  	}).Debugf("started watching for certificate updates")
   205  
   206  	// Certificate is Pending or in an Unknown state, let's wait for changes.
   207  	for {
   208  		select {
   209  		case event := <-updates:
   210  			switch v := event.(type) {
   211  			case api.EventUpdateNode:
   212  				// We got an update on the certificate record. If the status is a final state,
   213  				// return the certificate.
   214  				if isFinalState(v.Node.Certificate.Status) {
   215  					cert := v.Node.Certificate.Copy()
   216  					return &api.NodeCertificateStatusResponse{
   217  						Status:      &cert.Status,
   218  						Certificate: cert,
   219  					}, nil
   220  				}
   221  			}
   222  		case <-ctx.Done():
   223  			return nil, ctx.Err()
   224  		case <-serverCtx.Done():
   225  			return nil, s.ctx.Err()
   226  		}
   227  	}
   228  }
   229  
   230  // IssueNodeCertificate is responsible for gatekeeping both certificate requests from new nodes in the swarm,
   231  // and authorizing certificate renewals.
   232  // If a node presented a valid certificate, the corresponding certificate is set in a RENEW state.
   233  // If a node failed to present a valid certificate, we check for a valid join token and set the
   234  // role accordingly. A new random node ID is generated, and the corresponding node entry is created.
   235  // IssueNodeCertificate is the only place where new node entries to raft should be created.
   236  func (s *Server) IssueNodeCertificate(ctx context.Context, request *api.IssueNodeCertificateRequest) (*api.IssueNodeCertificateResponse, error) {
   237  	// First, let's see if the remote node is presenting a non-empty CSR
   238  	if len(request.CSR) == 0 {
   239  		return nil, status.Errorf(codes.InvalidArgument, codes.InvalidArgument.String())
   240  	}
   241  
   242  	if err := s.isReadyLocked(); err != nil {
   243  		return nil, err
   244  	}
   245  
   246  	var (
   247  		blacklistedCerts map[string]*api.BlacklistedCertificate
   248  		clusters         []*api.Cluster
   249  		err              error
   250  	)
   251  
   252  	s.store.View(func(readTx store.ReadTx) {
   253  		clusters, err = store.FindClusters(readTx, store.ByName(store.DefaultClusterName))
   254  	})
   255  
   256  	// Not having a cluster object yet means we can't check
   257  	// the blacklist.
   258  	if err == nil && len(clusters) == 1 {
   259  		blacklistedCerts = clusters[0].BlacklistedCertificates
   260  	}
   261  
   262  	// Renewing the cert with a local (unix socket) is always valid.
   263  	localNodeInfo := ctx.Value(LocalRequestKey)
   264  	if localNodeInfo != nil {
   265  		nodeInfo, ok := localNodeInfo.(RemoteNodeInfo)
   266  		if ok && nodeInfo.NodeID != "" {
   267  			return s.issueRenewCertificate(ctx, nodeInfo.NodeID, request.CSR)
   268  		}
   269  	}
   270  
   271  	// If the remote node is a worker (either forwarded by a manager, or calling directly),
   272  	// issue a renew worker certificate entry with the correct ID
   273  	nodeID, err := AuthorizeForwardedRoleAndOrg(ctx, []string{WorkerRole}, []string{ManagerRole}, s.clusterID, blacklistedCerts)
   274  	if err == nil {
   275  		return s.issueRenewCertificate(ctx, nodeID, request.CSR)
   276  	}
   277  
   278  	// If the remote node is a manager (either forwarded by another manager, or calling directly),
   279  	// issue a renew certificate entry with the correct ID
   280  	nodeID, err = AuthorizeForwardedRoleAndOrg(ctx, []string{ManagerRole}, []string{ManagerRole}, s.clusterID, blacklistedCerts)
   281  	if err == nil {
   282  		return s.issueRenewCertificate(ctx, nodeID, request.CSR)
   283  	}
   284  
   285  	// The remote node didn't successfully present a valid MTLS certificate, let's issue a
   286  	// certificate with a new random ID
   287  	role := api.NodeRole(-1)
   288  
   289  	s.mu.Lock()
   290  	if subtle.ConstantTimeCompare([]byte(s.joinTokens.Manager), []byte(request.Token)) == 1 {
   291  		role = api.NodeRoleManager
   292  	} else if subtle.ConstantTimeCompare([]byte(s.joinTokens.Worker), []byte(request.Token)) == 1 {
   293  		role = api.NodeRoleWorker
   294  	}
   295  	s.mu.Unlock()
   296  
   297  	if role < 0 {
   298  		return nil, status.Errorf(codes.InvalidArgument, "A valid join token is necessary to join this cluster")
   299  	}
   300  
   301  	// Max number of collisions of ID or CN to tolerate before giving up
   302  	maxRetries := 3
   303  	// Generate a random ID for this new node
   304  	for i := 0; ; i++ {
   305  		nodeID = identity.NewID()
   306  
   307  		// Create a new node
   308  		err := s.store.Update(func(tx store.Tx) error {
   309  			node := &api.Node{
   310  				Role: role,
   311  				ID:   nodeID,
   312  				Certificate: api.Certificate{
   313  					CSR:  request.CSR,
   314  					CN:   nodeID,
   315  					Role: role,
   316  					Status: api.IssuanceStatus{
   317  						State: api.IssuanceStatePending,
   318  					},
   319  				},
   320  				Spec: api.NodeSpec{
   321  					DesiredRole:  role,
   322  					Membership:   api.NodeMembershipAccepted,
   323  					Availability: request.Availability,
   324  				},
   325  			}
   326  			node.VXLANUDPPort = clusters[0].VXLANUDPPort
   327  			return store.CreateNode(tx, node)
   328  		})
   329  		if err == nil {
   330  			log.G(ctx).WithFields(logrus.Fields{
   331  				"node.id":   nodeID,
   332  				"node.role": role,
   333  				"method":    "IssueNodeCertificate",
   334  			}).Debugf("new certificate entry added")
   335  			break
   336  		}
   337  		if err != store.ErrExist {
   338  			return nil, err
   339  		}
   340  		if i == maxRetries {
   341  			return nil, err
   342  		}
   343  		log.G(ctx).WithFields(logrus.Fields{
   344  			"node.id":   nodeID,
   345  			"node.role": role,
   346  			"method":    "IssueNodeCertificate",
   347  		}).Errorf("randomly generated node ID collided with an existing one - retrying")
   348  	}
   349  
   350  	return &api.IssueNodeCertificateResponse{
   351  		NodeID:         nodeID,
   352  		NodeMembership: api.NodeMembershipAccepted,
   353  	}, nil
   354  }
   355  
   356  // issueRenewCertificate receives a nodeID and a CSR and modifies the node's certificate entry with the new CSR
   357  // and changes the state to RENEW, so it can be picked up and signed by the signing reconciliation loop
   358  func (s *Server) issueRenewCertificate(ctx context.Context, nodeID string, csr []byte) (*api.IssueNodeCertificateResponse, error) {
   359  	var (
   360  		cert api.Certificate
   361  		node *api.Node
   362  	)
   363  	err := s.store.Update(func(tx store.Tx) error {
   364  		// Attempt to retrieve the node with nodeID
   365  		node = store.GetNode(tx, nodeID)
   366  		if node == nil {
   367  			log.G(ctx).WithFields(logrus.Fields{
   368  				"node.id": nodeID,
   369  				"method":  "issueRenewCertificate",
   370  			}).Warnf("node does not exist")
   371  			// If this node doesn't exist, we shouldn't be renewing a certificate for it
   372  			return status.Errorf(codes.NotFound, "node %s not found when attempting to renew certificate", nodeID)
   373  		}
   374  
   375  		// Create a new Certificate entry for this node with the new CSR and a RENEW state
   376  		cert = api.Certificate{
   377  			CSR:  csr,
   378  			CN:   node.ID,
   379  			Role: node.Role,
   380  			Status: api.IssuanceStatus{
   381  				State: api.IssuanceStateRenew,
   382  			},
   383  		}
   384  
   385  		node.Certificate = cert
   386  		return store.UpdateNode(tx, node)
   387  	})
   388  	if err != nil {
   389  		return nil, err
   390  	}
   391  
   392  	log.G(ctx).WithFields(logrus.Fields{
   393  		"cert.cn":   cert.CN,
   394  		"cert.role": cert.Role,
   395  		"method":    "issueRenewCertificate",
   396  	}).Debugf("node certificate updated")
   397  
   398  	return &api.IssueNodeCertificateResponse{
   399  		NodeID:         nodeID,
   400  		NodeMembership: node.Spec.Membership,
   401  	}, nil
   402  }
   403  
   404  // GetRootCACertificate returns the certificate of the Root CA. It is used as a convenience for distributing
   405  // the root of trust for the swarm. Clients should be using the CA hash to verify if they weren't target to
   406  // a MiTM. If they fail to do so, node bootstrap works with TOFU semantics.
   407  func (s *Server) GetRootCACertificate(ctx context.Context, request *api.GetRootCACertificateRequest) (*api.GetRootCACertificateResponse, error) {
   408  	log.G(ctx).WithFields(logrus.Fields{
   409  		"method": "GetRootCACertificate",
   410  	})
   411  
   412  	s.signingMu.Lock()
   413  	defer s.signingMu.Unlock()
   414  
   415  	return &api.GetRootCACertificateResponse{
   416  		Certificate: s.localRootCA.Certs,
   417  	}, nil
   418  }
   419  
   420  // Run runs the CA signer main loop.
   421  // The CA signer can be stopped with cancelling ctx or calling Stop().
   422  func (s *Server) Run(ctx context.Context) error {
   423  	s.mu.Lock()
   424  	if s.isRunning() {
   425  		s.mu.Unlock()
   426  		return errors.New("CA signer is already running")
   427  	}
   428  	s.wg.Add(1)
   429  	s.ctx, s.cancel = context.WithCancel(log.WithModule(ctx, "ca"))
   430  	ctx = s.ctx
   431  	s.mu.Unlock()
   432  	defer s.wg.Done()
   433  	defer func() {
   434  		s.mu.Lock()
   435  		s.mu.Unlock()
   436  	}()
   437  
   438  	// Retrieve the channels to keep track of changes in the cluster
   439  	// Retrieve all the currently registered nodes
   440  	var (
   441  		nodes   []*api.Node
   442  		cluster *api.Cluster
   443  		err     error
   444  	)
   445  	updates, cancel, err := store.ViewAndWatch(
   446  		s.store,
   447  		func(readTx store.ReadTx) error {
   448  			cluster = store.GetCluster(readTx, s.clusterID)
   449  			if cluster == nil {
   450  				return errors.New("could not find cluster object")
   451  			}
   452  			nodes, err = store.FindNodes(readTx, store.All)
   453  			return err
   454  		},
   455  		api.EventCreateNode{},
   456  		api.EventUpdateNode{},
   457  		api.EventDeleteNode{},
   458  		api.EventUpdateCluster{
   459  			Cluster: &api.Cluster{ID: s.clusterID},
   460  			Checks:  []api.ClusterCheckFunc{api.ClusterCheckID},
   461  		},
   462  	)
   463  
   464  	// call once to ensure that the join tokens and local/external CA signer are always set
   465  	rootReconciler := &rootRotationReconciler{
   466  		ctx:                 log.WithField(ctx, "method", "(*Server).rootRotationReconciler"),
   467  		clusterID:           s.clusterID,
   468  		store:               s.store,
   469  		batchUpdateInterval: s.rootReconciliationRetryInterval,
   470  	}
   471  
   472  	s.UpdateRootCA(ctx, cluster, rootReconciler)
   473  
   474  	// Do this after updateCluster has been called, so Ready() and isRunning never returns true without
   475  	// the join tokens and external CA/security config's root CA being set correctly
   476  	s.mu.Lock()
   477  	close(s.started)
   478  	s.mu.Unlock()
   479  
   480  	if err != nil {
   481  		log.G(ctx).WithFields(logrus.Fields{
   482  			"method": "(*Server).Run",
   483  		}).WithError(err).Errorf("snapshot store view failed")
   484  		return err
   485  	}
   486  	defer cancel()
   487  
   488  	// We might have missed some updates if there was a leader election,
   489  	// so let's pick up the slack.
   490  	if err := s.reconcileNodeCertificates(ctx, nodes); err != nil {
   491  		// We don't return here because that means the Run loop would
   492  		// never run. Log an error instead.
   493  		log.G(ctx).WithFields(logrus.Fields{
   494  			"method": "(*Server).Run",
   495  		}).WithError(err).Errorf("error attempting to reconcile certificates")
   496  	}
   497  
   498  	ticker := time.NewTicker(s.reconciliationRetryInterval)
   499  	defer ticker.Stop()
   500  
   501  	externalTLSCredsChange, externalTLSWatchCancel := s.securityConfig.Watch()
   502  	defer externalTLSWatchCancel()
   503  
   504  	// Watch for new nodes being created, new nodes being updated, and changes
   505  	// to the cluster
   506  	for {
   507  		select {
   508  		case <-ctx.Done():
   509  			return nil
   510  		default:
   511  		}
   512  
   513  		select {
   514  		case event := <-updates:
   515  			switch v := event.(type) {
   516  			case api.EventCreateNode:
   517  				s.evaluateAndSignNodeCert(ctx, v.Node)
   518  				rootReconciler.UpdateNode(v.Node)
   519  			case api.EventUpdateNode:
   520  				// If this certificate is already at a final state
   521  				// no need to evaluate and sign it.
   522  				if !isFinalState(v.Node.Certificate.Status) {
   523  					s.evaluateAndSignNodeCert(ctx, v.Node)
   524  				}
   525  				rootReconciler.UpdateNode(v.Node)
   526  			case api.EventDeleteNode:
   527  				rootReconciler.DeleteNode(v.Node)
   528  			case api.EventUpdateCluster:
   529  				if v.Cluster.ID == s.clusterID {
   530  					s.UpdateRootCA(ctx, v.Cluster, rootReconciler)
   531  				}
   532  			}
   533  		case <-externalTLSCredsChange:
   534  			// The TLS certificates can rotate independently of the root CA (and hence which roots the
   535  			// external CA trusts) and external CA URLs.  It's possible that the root CA update is received
   536  			// before the external TLS cred change notification.  During that period, it is possible that
   537  			// the TLS creds will expire or otherwise fail to authorize against external CAs.  However, in
   538  			// that case signing will just fail with a recoverable connectivity error - the state of the
   539  			// certificate issuance is left as pending, and on the next tick, the server will try to sign
   540  			// all nodes with pending certs again (by which time the TLS cred change will have been
   541  			// received).
   542  
   543  			// Note that if the external CA changes, the new external CA *MUST* trust the current server's
   544  			// certificate issuer, and this server's certificates should not be extremely close to expiry,
   545  			// otherwise this server would not be able to get new TLS certificates and will no longer be
   546  			// able to function.
   547  			s.signingMu.Lock()
   548  			s.externalCA.UpdateTLSConfig(NewExternalCATLSConfig(
   549  				s.securityConfig.ClientTLSCreds.Config().Certificates, s.externalCAPool))
   550  			s.signingMu.Unlock()
   551  		case <-ticker.C:
   552  			for _, node := range s.pending {
   553  				if err := s.evaluateAndSignNodeCert(ctx, node); err != nil {
   554  					// If this sign operation did not succeed, the rest are
   555  					// unlikely to. Yield so that we don't hammer an external CA.
   556  					// Since the map iteration order is randomized, there is no
   557  					// risk of getting stuck on a problematic CSR.
   558  					break
   559  				}
   560  			}
   561  		case <-ctx.Done():
   562  			return nil
   563  		}
   564  	}
   565  }
   566  
   567  // Stop stops the CA and closes all grpc streams.
   568  func (s *Server) Stop() error {
   569  	s.mu.Lock()
   570  
   571  	if !s.isRunning() {
   572  		s.mu.Unlock()
   573  		return errors.New("CA signer is already stopped")
   574  	}
   575  	s.cancel()
   576  	s.started = make(chan struct{})
   577  	s.joinTokens = nil
   578  	s.mu.Unlock()
   579  
   580  	// Wait for Run to complete
   581  	s.wg.Wait()
   582  
   583  	return nil
   584  }
   585  
   586  // Ready waits on the ready channel and returns when the server is ready to serve.
   587  func (s *Server) Ready() <-chan struct{} {
   588  	s.mu.Lock()
   589  	defer s.mu.Unlock()
   590  	return s.started
   591  }
   592  
   593  func (s *Server) isRunningLocked() (context.Context, error) {
   594  	s.mu.Lock()
   595  	if !s.isRunning() {
   596  		s.mu.Unlock()
   597  		return nil, status.Errorf(codes.Aborted, "CA signer is stopped")
   598  	}
   599  	ctx := s.ctx
   600  	s.mu.Unlock()
   601  	return ctx, nil
   602  }
   603  
   604  func (s *Server) isReadyLocked() error {
   605  	s.mu.Lock()
   606  	defer s.mu.Unlock()
   607  	if !s.isRunning() {
   608  		return status.Errorf(codes.Aborted, "CA signer is stopped")
   609  	}
   610  	if s.joinTokens == nil {
   611  		return status.Errorf(codes.Aborted, "CA signer is still starting")
   612  	}
   613  	return nil
   614  }
   615  
   616  func (s *Server) isRunning() bool {
   617  	if s.ctx == nil {
   618  		return false
   619  	}
   620  	select {
   621  	case <-s.ctx.Done():
   622  		return false
   623  	default:
   624  	}
   625  	return true
   626  }
   627  
   628  // filterExternalCAURLS returns a list of external CA urls filtered by the desired cert.
   629  func filterExternalCAURLS(ctx context.Context, desiredCert, defaultCert []byte, apiExternalCAs []*api.ExternalCA) (urls []string) {
   630  	desiredCert = NormalizePEMs(desiredCert)
   631  
   632  	// TODO(aaronl): In the future, this will be abstracted with an ExternalCA interface that has different
   633  	// implementations for different CA types. At the moment, only CFSSL is supported.
   634  	for i, extCA := range apiExternalCAs {
   635  		// We want to support old external CA specifications which did not have a CA cert.  If there is no cert specified,
   636  		// we assume it's the old cert
   637  		certForExtCA := extCA.CACert
   638  		if len(certForExtCA) == 0 {
   639  			certForExtCA = defaultCert
   640  		}
   641  		certForExtCA = NormalizePEMs(certForExtCA)
   642  		if extCA.Protocol != api.ExternalCA_CAProtocolCFSSL {
   643  			log.G(ctx).Debugf("skipping external CA %d (url: %s) due to unknown protocol type", i, extCA.URL)
   644  			continue
   645  		}
   646  		if !bytes.Equal(certForExtCA, desiredCert) {
   647  			log.G(ctx).Debugf("skipping external CA %d (url: %s) because it has the wrong CA cert", i, extCA.URL)
   648  			continue
   649  		}
   650  		urls = append(urls, extCA.URL)
   651  	}
   652  	return
   653  }
   654  
   655  // UpdateRootCA is called when there are cluster changes, and it ensures that the local RootCA is
   656  // always aware of changes in clusterExpiry and the Root CA key material - this can be called by
   657  // anything to update the root CA material
   658  func (s *Server) UpdateRootCA(ctx context.Context, cluster *api.Cluster, reconciler *rootRotationReconciler) error {
   659  	s.mu.Lock()
   660  	s.joinTokens = cluster.RootCA.JoinTokens.Copy()
   661  	s.mu.Unlock()
   662  	rCA := cluster.RootCA.Copy()
   663  	if reconciler != nil {
   664  		reconciler.UpdateRootCA(rCA)
   665  	}
   666  
   667  	s.signingMu.Lock()
   668  	defer s.signingMu.Unlock()
   669  	firstSeenCluster := s.lastSeenClusterRootCA == nil && s.lastSeenExternalCAs == nil
   670  	rootCAChanged := len(rCA.CACert) != 0 && !equality.RootCAEqualStable(s.lastSeenClusterRootCA, rCA)
   671  	externalCAChanged := !equality.ExternalCAsEqualStable(s.lastSeenExternalCAs, cluster.Spec.CAConfig.ExternalCAs)
   672  	ctx = log.WithLogger(ctx, log.G(ctx).WithFields(logrus.Fields{
   673  		"cluster.id": cluster.ID,
   674  		"method":     "(*Server).UpdateRootCA",
   675  	}))
   676  
   677  	if rootCAChanged {
   678  		setOrUpdate := "set"
   679  		if !firstSeenCluster {
   680  			log.G(ctx).Debug("Updating signing root CA and external CA due to change in cluster Root CA")
   681  			setOrUpdate = "updated"
   682  		}
   683  		expiry := DefaultNodeCertExpiration
   684  		if cluster.Spec.CAConfig.NodeCertExpiry != nil {
   685  			// NodeCertExpiry exists, let's try to parse the duration out of it
   686  			clusterExpiry, err := gogotypes.DurationFromProto(cluster.Spec.CAConfig.NodeCertExpiry)
   687  			if err != nil {
   688  				log.G(ctx).WithError(err).Warn("failed to parse certificate expiration, using default")
   689  			} else {
   690  				// We were able to successfully parse the expiration out of the cluster.
   691  				expiry = clusterExpiry
   692  			}
   693  		} else {
   694  			// NodeCertExpiry seems to be nil
   695  			log.G(ctx).Warn("no certificate expiration specified, using default")
   696  		}
   697  		// Attempt to update our local RootCA with the new parameters
   698  		updatedRootCA, err := RootCAFromAPI(ctx, rCA, expiry)
   699  		if err != nil {
   700  			return errors.Wrap(err, "invalid Root CA object in cluster")
   701  		}
   702  
   703  		s.localRootCA = &updatedRootCA
   704  		s.externalCAPool = updatedRootCA.Pool
   705  		externalCACert := rCA.CACert
   706  		if rCA.RootRotation != nil {
   707  			externalCACert = rCA.RootRotation.CACert
   708  			// the external CA has to trust the new CA cert
   709  			s.externalCAPool = x509.NewCertPool()
   710  			s.externalCAPool.AppendCertsFromPEM(rCA.CACert)
   711  			s.externalCAPool.AppendCertsFromPEM(rCA.RootRotation.CACert)
   712  		}
   713  		s.lastSeenExternalCAs = cluster.Spec.CAConfig.Copy().ExternalCAs
   714  		urls := filterExternalCAURLS(ctx, externalCACert, rCA.CACert, s.lastSeenExternalCAs)
   715  		// Replace the external CA with the relevant intermediates, URLS, and TLS config
   716  		s.externalCA = NewExternalCA(updatedRootCA.Intermediates,
   717  			NewExternalCATLSConfig(s.securityConfig.ClientTLSCreds.Config().Certificates, s.externalCAPool), urls...)
   718  
   719  		// only update the server cache if we've successfully updated the root CA
   720  		log.G(ctx).Debugf("Root CA %s successfully", setOrUpdate)
   721  		s.lastSeenClusterRootCA = rCA
   722  	} else if externalCAChanged {
   723  		// we want to update only if the external CA URLS have changed, since if the root CA has changed we already
   724  		// run similar logic
   725  		if !firstSeenCluster {
   726  			log.G(ctx).Debug("Updating security config external CA URLs due to change in cluster spec's list of external CAs")
   727  		}
   728  		wantedExternalCACert := rCA.CACert // we want to only add external CA URLs that use this cert
   729  		if rCA.RootRotation != nil {
   730  			// we're rotating to a new root, so we only want external CAs with the new root cert
   731  			wantedExternalCACert = rCA.RootRotation.CACert
   732  		}
   733  		// Update our external CA with the list of External CA URLs from the new cluster state
   734  		s.lastSeenExternalCAs = cluster.Spec.CAConfig.Copy().ExternalCAs
   735  		urls := filterExternalCAURLS(ctx, wantedExternalCACert, rCA.CACert, s.lastSeenExternalCAs)
   736  		s.externalCA.UpdateURLs(urls...)
   737  	}
   738  	return nil
   739  }
   740  
   741  // evaluateAndSignNodeCert implements the logic of which certificates to sign
   742  func (s *Server) evaluateAndSignNodeCert(ctx context.Context, node *api.Node) error {
   743  	// If the desired membership and actual state are in sync, there's
   744  	// nothing to do.
   745  	certState := node.Certificate.Status.State
   746  	if node.Spec.Membership == api.NodeMembershipAccepted &&
   747  		(certState == api.IssuanceStateIssued || certState == api.IssuanceStateRotate) {
   748  		return nil
   749  	}
   750  
   751  	// If the certificate state is renew, then it is a server-sided accepted cert (cert renewals)
   752  	if certState == api.IssuanceStateRenew {
   753  		return s.signNodeCert(ctx, node)
   754  	}
   755  
   756  	// Sign this certificate if a user explicitly changed it to Accepted, and
   757  	// the certificate is in pending state
   758  	if node.Spec.Membership == api.NodeMembershipAccepted && certState == api.IssuanceStatePending {
   759  		return s.signNodeCert(ctx, node)
   760  	}
   761  
   762  	return nil
   763  }
   764  
   765  // signNodeCert does the bulk of the work for signing a certificate
   766  func (s *Server) signNodeCert(ctx context.Context, node *api.Node) error {
   767  	s.signingMu.Lock()
   768  	rootCA := s.localRootCA
   769  	externalCA := s.externalCA
   770  	s.signingMu.Unlock()
   771  
   772  	node = node.Copy()
   773  	nodeID := node.ID
   774  	// Convert the role from proto format
   775  	role, err := ParseRole(node.Certificate.Role)
   776  	if err != nil {
   777  		log.G(ctx).WithFields(logrus.Fields{
   778  			"node.id": node.ID,
   779  			"method":  "(*Server).signNodeCert",
   780  		}).WithError(err).Errorf("failed to parse role")
   781  		return errors.New("failed to parse role")
   782  	}
   783  
   784  	s.pending[node.ID] = node
   785  
   786  	// Attempt to sign the CSR
   787  	var (
   788  		rawCSR = node.Certificate.CSR
   789  		cn     = node.Certificate.CN
   790  		ou     = role
   791  		org    = s.clusterID
   792  	)
   793  
   794  	// Try using the external CA first.
   795  	cert, err := externalCA.Sign(ctx, PrepareCSR(rawCSR, cn, ou, org))
   796  	if err == ErrNoExternalCAURLs {
   797  		// No external CA servers configured. Try using the local CA.
   798  		cert, err = rootCA.ParseValidateAndSignCSR(rawCSR, cn, ou, org)
   799  	}
   800  
   801  	if err != nil {
   802  		log.G(ctx).WithFields(logrus.Fields{
   803  			"node.id": node.ID,
   804  			"method":  "(*Server).signNodeCert",
   805  		}).WithError(err).Errorf("failed to sign CSR")
   806  
   807  		// If the current state is already Failed, no need to change it
   808  		if node.Certificate.Status.State == api.IssuanceStateFailed {
   809  			delete(s.pending, node.ID)
   810  			return errors.New("failed to sign CSR")
   811  		}
   812  
   813  		if _, ok := err.(recoverableErr); ok {
   814  			// Return without changing the state of the certificate. We may
   815  			// retry signing it in the future.
   816  			return errors.New("failed to sign CSR")
   817  		}
   818  
   819  		// We failed to sign this CSR, change the state to FAILED
   820  		err = s.store.Update(func(tx store.Tx) error {
   821  			node := store.GetNode(tx, nodeID)
   822  			if node == nil {
   823  				return errors.Errorf("node %s not found", nodeID)
   824  			}
   825  
   826  			node.Certificate.Status = api.IssuanceStatus{
   827  				State: api.IssuanceStateFailed,
   828  				Err:   err.Error(),
   829  			}
   830  
   831  			return store.UpdateNode(tx, node)
   832  		})
   833  		if err != nil {
   834  			log.G(ctx).WithFields(logrus.Fields{
   835  				"node.id": nodeID,
   836  				"method":  "(*Server).signNodeCert",
   837  			}).WithError(err).Errorf("transaction failed when setting state to FAILED")
   838  		}
   839  
   840  		delete(s.pending, node.ID)
   841  		return errors.New("failed to sign CSR")
   842  	}
   843  
   844  	// We were able to successfully sign the new CSR. Let's try to update the nodeStore
   845  	for {
   846  		err = s.store.Update(func(tx store.Tx) error {
   847  			node.Certificate.Certificate = cert
   848  			node.Certificate.Status = api.IssuanceStatus{
   849  				State: api.IssuanceStateIssued,
   850  			}
   851  
   852  			err := store.UpdateNode(tx, node)
   853  			if err != nil {
   854  				node = store.GetNode(tx, nodeID)
   855  				if node == nil {
   856  					err = errors.Errorf("node %s does not exist", nodeID)
   857  				}
   858  			}
   859  			return err
   860  		})
   861  		if err == nil {
   862  			log.G(ctx).WithFields(logrus.Fields{
   863  				"node.id":   node.ID,
   864  				"node.role": node.Certificate.Role,
   865  				"method":    "(*Server).signNodeCert",
   866  			}).Debugf("certificate issued")
   867  			delete(s.pending, node.ID)
   868  			break
   869  		}
   870  		if err == store.ErrSequenceConflict {
   871  			continue
   872  		}
   873  
   874  		log.G(ctx).WithFields(logrus.Fields{
   875  			"node.id": nodeID,
   876  			"method":  "(*Server).signNodeCert",
   877  		}).WithError(err).Errorf("transaction failed")
   878  		return errors.New("transaction failed")
   879  	}
   880  	return nil
   881  }
   882  
   883  // reconcileNodeCertificates is a helper method that calls evaluateAndSignNodeCert on all the
   884  // nodes.
   885  func (s *Server) reconcileNodeCertificates(ctx context.Context, nodes []*api.Node) error {
   886  	for _, node := range nodes {
   887  		s.evaluateAndSignNodeCert(ctx, node)
   888  	}
   889  
   890  	return nil
   891  }
   892  
   893  // A successfully issued certificate and a failed certificate are our current final states
   894  func isFinalState(status api.IssuanceStatus) bool {
   895  	if status.State == api.IssuanceStateIssued || status.State == api.IssuanceStateFailed ||
   896  		status.State == api.IssuanceStateRotate {
   897  		return true
   898  	}
   899  
   900  	return false
   901  }
   902  
   903  // RootCAFromAPI creates a RootCA object from an api.RootCA object
   904  func RootCAFromAPI(ctx context.Context, apiRootCA *api.RootCA, expiry time.Duration) (RootCA, error) {
   905  	var intermediates []byte
   906  	signingCert := apiRootCA.CACert
   907  	signingKey := apiRootCA.CAKey
   908  	if apiRootCA.RootRotation != nil {
   909  		signingCert = apiRootCA.RootRotation.CrossSignedCACert
   910  		signingKey = apiRootCA.RootRotation.CAKey
   911  		intermediates = apiRootCA.RootRotation.CrossSignedCACert
   912  	}
   913  	if signingKey == nil {
   914  		signingCert = nil
   915  	}
   916  	return NewRootCA(apiRootCA.CACert, signingCert, signingKey, expiry, intermediates)
   917  }