github.com/kaisenlinux/docker.io@v0.0.0-20230510090727-ea55db55fac7/swarmkit/ca/server.go (about) 1 package ca 2 3 import ( 4 "bytes" 5 "context" 6 "crypto/subtle" 7 "crypto/x509" 8 "sync" 9 "time" 10 11 "github.com/docker/swarmkit/api" 12 "github.com/docker/swarmkit/api/equality" 13 "github.com/docker/swarmkit/identity" 14 "github.com/docker/swarmkit/log" 15 "github.com/docker/swarmkit/manager/state/store" 16 gogotypes "github.com/gogo/protobuf/types" 17 "github.com/pkg/errors" 18 "github.com/sirupsen/logrus" 19 "google.golang.org/grpc/codes" 20 "google.golang.org/grpc/status" 21 ) 22 23 const ( 24 defaultReconciliationRetryInterval = 10 * time.Second 25 defaultRootReconciliationInterval = 3 * time.Second 26 ) 27 28 // Server is the CA and NodeCA API gRPC server. 29 // TODO(aaronl): At some point we may want to have separate implementations of 30 // CA, NodeCA, and other hypothetical future CA services. At the moment, 31 // breaking it apart doesn't seem worth it. 32 type Server struct { 33 mu sync.Mutex 34 wg sync.WaitGroup 35 ctx context.Context 36 cancel func() 37 store *store.MemoryStore 38 securityConfig *SecurityConfig 39 clusterID string 40 localRootCA *RootCA 41 externalCA *ExternalCA 42 externalCAPool *x509.CertPool 43 joinTokens *api.JoinTokens 44 reconciliationRetryInterval time.Duration 45 46 // pending is a map of nodes with pending certificates issuance or 47 // renewal. They are indexed by node ID. 48 pending map[string]*api.Node 49 50 // started is a channel which gets closed once the server is running 51 // and able to service RPCs. 52 started chan struct{} 53 54 // these are cached values to ensure we only update the security config when 55 // the cluster root CA and external CAs have changed - the cluster object 56 // can change for other reasons, and it would not be necessary to update 57 // the security config as a result 58 lastSeenClusterRootCA *api.RootCA 59 lastSeenExternalCAs []*api.ExternalCA 60 61 // This mutex protects the components of the CA server used to issue new certificates 62 // (and any attributes used to update those components): `lastSeenClusterRootCA` and 63 // `lastSeenExternalCA`, which are used to update `externalCA` and the `rootCA` object 64 // of the SecurityConfig 65 signingMu sync.Mutex 66 67 // lets us monitor and finish root rotations 68 rootReconciler *rootRotationReconciler 69 rootReconciliationRetryInterval time.Duration 70 } 71 72 // DefaultCAConfig returns the default CA Config, with a default expiration. 73 func DefaultCAConfig() api.CAConfig { 74 return api.CAConfig{ 75 NodeCertExpiry: gogotypes.DurationProto(DefaultNodeCertExpiration), 76 } 77 } 78 79 // NewServer creates a CA API server. 80 func NewServer(store *store.MemoryStore, securityConfig *SecurityConfig) *Server { 81 return &Server{ 82 store: store, 83 securityConfig: securityConfig, 84 localRootCA: securityConfig.RootCA(), 85 externalCA: NewExternalCA(nil, nil), 86 pending: make(map[string]*api.Node), 87 started: make(chan struct{}), 88 reconciliationRetryInterval: defaultReconciliationRetryInterval, 89 rootReconciliationRetryInterval: defaultRootReconciliationInterval, 90 clusterID: securityConfig.ClientTLSCreds.Organization(), 91 } 92 } 93 94 // ExternalCA returns the current external CA - this is exposed to support unit testing only, and the external CA 95 // should really be a private field 96 func (s *Server) ExternalCA() *ExternalCA { 97 s.signingMu.Lock() 98 defer s.signingMu.Unlock() 99 return s.externalCA 100 } 101 102 // RootCA returns the current local root CA - this is exposed to support unit testing only, and the root CA 103 // should really be a private field 104 func (s *Server) RootCA() *RootCA { 105 s.signingMu.Lock() 106 defer s.signingMu.Unlock() 107 return s.localRootCA 108 } 109 110 // SetReconciliationRetryInterval changes the time interval between 111 // reconciliation attempts. This function must be called before Run. 112 func (s *Server) SetReconciliationRetryInterval(reconciliationRetryInterval time.Duration) { 113 s.reconciliationRetryInterval = reconciliationRetryInterval 114 } 115 116 // SetRootReconciliationInterval changes the time interval between root rotation 117 // reconciliation attempts. This function must be called before Run. 118 func (s *Server) SetRootReconciliationInterval(interval time.Duration) { 119 s.rootReconciliationRetryInterval = interval 120 } 121 122 // GetUnlockKey is responsible for returning the current unlock key used for encrypting TLS private keys and 123 // other at rest data. Access to this RPC call should only be allowed via mutual TLS from managers. 124 func (s *Server) GetUnlockKey(ctx context.Context, request *api.GetUnlockKeyRequest) (*api.GetUnlockKeyResponse, error) { 125 // This directly queries the store, rather than storing the unlock key and version on 126 // the `Server` object and updating it `updateCluster` is called, because we need this 127 // API to return the latest version of the key. Otherwise, there might be a slight delay 128 // between when the cluster gets updated, and when this function returns the latest key. 129 // This delay is currently unacceptable because this RPC call is the only way, after a 130 // cluster update, to get the actual value of the unlock key, and we don't want to return 131 // a cached value. 132 resp := api.GetUnlockKeyResponse{} 133 s.store.View(func(tx store.ReadTx) { 134 cluster := store.GetCluster(tx, s.clusterID) 135 resp.Version = cluster.Meta.Version 136 if cluster.Spec.EncryptionConfig.AutoLockManagers { 137 for _, encryptionKey := range cluster.UnlockKeys { 138 if encryptionKey.Subsystem == ManagerRole { 139 resp.UnlockKey = encryptionKey.Key 140 return 141 } 142 } 143 } 144 }) 145 146 return &resp, nil 147 } 148 149 // NodeCertificateStatus returns the current issuance status of an issuance request identified by the nodeID 150 func (s *Server) NodeCertificateStatus(ctx context.Context, request *api.NodeCertificateStatusRequest) (*api.NodeCertificateStatusResponse, error) { 151 if request.NodeID == "" { 152 return nil, status.Errorf(codes.InvalidArgument, codes.InvalidArgument.String()) 153 } 154 155 serverCtx, err := s.isRunningLocked() 156 if err != nil { 157 return nil, err 158 } 159 160 var node *api.Node 161 162 event := api.EventUpdateNode{ 163 Node: &api.Node{ID: request.NodeID}, 164 Checks: []api.NodeCheckFunc{api.NodeCheckID}, 165 } 166 167 // Retrieve the current value of the certificate with this token, and create a watcher 168 updates, cancel, err := store.ViewAndWatch( 169 s.store, 170 func(tx store.ReadTx) error { 171 node = store.GetNode(tx, request.NodeID) 172 return nil 173 }, 174 event, 175 ) 176 if err != nil { 177 return nil, err 178 } 179 defer cancel() 180 181 // This node ID doesn't exist 182 if node == nil { 183 return nil, status.Errorf(codes.NotFound, codes.NotFound.String()) 184 } 185 186 log.G(ctx).WithFields(logrus.Fields{ 187 "node.id": node.ID, 188 "status": node.Certificate.Status, 189 "method": "NodeCertificateStatus", 190 }) 191 192 // If this certificate has a final state, return it immediately (both pending and renew are transition states) 193 if isFinalState(node.Certificate.Status) { 194 return &api.NodeCertificateStatusResponse{ 195 Status: &node.Certificate.Status, 196 Certificate: &node.Certificate, 197 }, nil 198 } 199 200 log.G(ctx).WithFields(logrus.Fields{ 201 "node.id": node.ID, 202 "status": node.Certificate.Status, 203 "method": "NodeCertificateStatus", 204 }).Debugf("started watching for certificate updates") 205 206 // Certificate is Pending or in an Unknown state, let's wait for changes. 207 for { 208 select { 209 case event := <-updates: 210 switch v := event.(type) { 211 case api.EventUpdateNode: 212 // We got an update on the certificate record. If the status is a final state, 213 // return the certificate. 214 if isFinalState(v.Node.Certificate.Status) { 215 cert := v.Node.Certificate.Copy() 216 return &api.NodeCertificateStatusResponse{ 217 Status: &cert.Status, 218 Certificate: cert, 219 }, nil 220 } 221 } 222 case <-ctx.Done(): 223 return nil, ctx.Err() 224 case <-serverCtx.Done(): 225 return nil, s.ctx.Err() 226 } 227 } 228 } 229 230 // IssueNodeCertificate is responsible for gatekeeping both certificate requests from new nodes in the swarm, 231 // and authorizing certificate renewals. 232 // If a node presented a valid certificate, the corresponding certificate is set in a RENEW state. 233 // If a node failed to present a valid certificate, we check for a valid join token and set the 234 // role accordingly. A new random node ID is generated, and the corresponding node entry is created. 235 // IssueNodeCertificate is the only place where new node entries to raft should be created. 236 func (s *Server) IssueNodeCertificate(ctx context.Context, request *api.IssueNodeCertificateRequest) (*api.IssueNodeCertificateResponse, error) { 237 // First, let's see if the remote node is presenting a non-empty CSR 238 if len(request.CSR) == 0 { 239 return nil, status.Errorf(codes.InvalidArgument, codes.InvalidArgument.String()) 240 } 241 242 if err := s.isReadyLocked(); err != nil { 243 return nil, err 244 } 245 246 var ( 247 blacklistedCerts map[string]*api.BlacklistedCertificate 248 clusters []*api.Cluster 249 err error 250 ) 251 252 s.store.View(func(readTx store.ReadTx) { 253 clusters, err = store.FindClusters(readTx, store.ByName(store.DefaultClusterName)) 254 }) 255 256 // Not having a cluster object yet means we can't check 257 // the blacklist. 258 if err == nil && len(clusters) == 1 { 259 blacklistedCerts = clusters[0].BlacklistedCertificates 260 } 261 262 // Renewing the cert with a local (unix socket) is always valid. 263 localNodeInfo := ctx.Value(LocalRequestKey) 264 if localNodeInfo != nil { 265 nodeInfo, ok := localNodeInfo.(RemoteNodeInfo) 266 if ok && nodeInfo.NodeID != "" { 267 return s.issueRenewCertificate(ctx, nodeInfo.NodeID, request.CSR) 268 } 269 } 270 271 // If the remote node is a worker (either forwarded by a manager, or calling directly), 272 // issue a renew worker certificate entry with the correct ID 273 nodeID, err := AuthorizeForwardedRoleAndOrg(ctx, []string{WorkerRole}, []string{ManagerRole}, s.clusterID, blacklistedCerts) 274 if err == nil { 275 return s.issueRenewCertificate(ctx, nodeID, request.CSR) 276 } 277 278 // If the remote node is a manager (either forwarded by another manager, or calling directly), 279 // issue a renew certificate entry with the correct ID 280 nodeID, err = AuthorizeForwardedRoleAndOrg(ctx, []string{ManagerRole}, []string{ManagerRole}, s.clusterID, blacklistedCerts) 281 if err == nil { 282 return s.issueRenewCertificate(ctx, nodeID, request.CSR) 283 } 284 285 // The remote node didn't successfully present a valid MTLS certificate, let's issue a 286 // certificate with a new random ID 287 role := api.NodeRole(-1) 288 289 s.mu.Lock() 290 if subtle.ConstantTimeCompare([]byte(s.joinTokens.Manager), []byte(request.Token)) == 1 { 291 role = api.NodeRoleManager 292 } else if subtle.ConstantTimeCompare([]byte(s.joinTokens.Worker), []byte(request.Token)) == 1 { 293 role = api.NodeRoleWorker 294 } 295 s.mu.Unlock() 296 297 if role < 0 { 298 return nil, status.Errorf(codes.InvalidArgument, "A valid join token is necessary to join this cluster") 299 } 300 301 // Max number of collisions of ID or CN to tolerate before giving up 302 maxRetries := 3 303 // Generate a random ID for this new node 304 for i := 0; ; i++ { 305 nodeID = identity.NewID() 306 307 // Create a new node 308 err := s.store.Update(func(tx store.Tx) error { 309 node := &api.Node{ 310 Role: role, 311 ID: nodeID, 312 Certificate: api.Certificate{ 313 CSR: request.CSR, 314 CN: nodeID, 315 Role: role, 316 Status: api.IssuanceStatus{ 317 State: api.IssuanceStatePending, 318 }, 319 }, 320 Spec: api.NodeSpec{ 321 DesiredRole: role, 322 Membership: api.NodeMembershipAccepted, 323 Availability: request.Availability, 324 }, 325 } 326 node.VXLANUDPPort = clusters[0].VXLANUDPPort 327 return store.CreateNode(tx, node) 328 }) 329 if err == nil { 330 log.G(ctx).WithFields(logrus.Fields{ 331 "node.id": nodeID, 332 "node.role": role, 333 "method": "IssueNodeCertificate", 334 }).Debugf("new certificate entry added") 335 break 336 } 337 if err != store.ErrExist { 338 return nil, err 339 } 340 if i == maxRetries { 341 return nil, err 342 } 343 log.G(ctx).WithFields(logrus.Fields{ 344 "node.id": nodeID, 345 "node.role": role, 346 "method": "IssueNodeCertificate", 347 }).Errorf("randomly generated node ID collided with an existing one - retrying") 348 } 349 350 return &api.IssueNodeCertificateResponse{ 351 NodeID: nodeID, 352 NodeMembership: api.NodeMembershipAccepted, 353 }, nil 354 } 355 356 // issueRenewCertificate receives a nodeID and a CSR and modifies the node's certificate entry with the new CSR 357 // and changes the state to RENEW, so it can be picked up and signed by the signing reconciliation loop 358 func (s *Server) issueRenewCertificate(ctx context.Context, nodeID string, csr []byte) (*api.IssueNodeCertificateResponse, error) { 359 var ( 360 cert api.Certificate 361 node *api.Node 362 ) 363 err := s.store.Update(func(tx store.Tx) error { 364 // Attempt to retrieve the node with nodeID 365 node = store.GetNode(tx, nodeID) 366 if node == nil { 367 log.G(ctx).WithFields(logrus.Fields{ 368 "node.id": nodeID, 369 "method": "issueRenewCertificate", 370 }).Warnf("node does not exist") 371 // If this node doesn't exist, we shouldn't be renewing a certificate for it 372 return status.Errorf(codes.NotFound, "node %s not found when attempting to renew certificate", nodeID) 373 } 374 375 // Create a new Certificate entry for this node with the new CSR and a RENEW state 376 cert = api.Certificate{ 377 CSR: csr, 378 CN: node.ID, 379 Role: node.Role, 380 Status: api.IssuanceStatus{ 381 State: api.IssuanceStateRenew, 382 }, 383 } 384 385 node.Certificate = cert 386 return store.UpdateNode(tx, node) 387 }) 388 if err != nil { 389 return nil, err 390 } 391 392 log.G(ctx).WithFields(logrus.Fields{ 393 "cert.cn": cert.CN, 394 "cert.role": cert.Role, 395 "method": "issueRenewCertificate", 396 }).Debugf("node certificate updated") 397 398 return &api.IssueNodeCertificateResponse{ 399 NodeID: nodeID, 400 NodeMembership: node.Spec.Membership, 401 }, nil 402 } 403 404 // GetRootCACertificate returns the certificate of the Root CA. It is used as a convenience for distributing 405 // the root of trust for the swarm. Clients should be using the CA hash to verify if they weren't target to 406 // a MiTM. If they fail to do so, node bootstrap works with TOFU semantics. 407 func (s *Server) GetRootCACertificate(ctx context.Context, request *api.GetRootCACertificateRequest) (*api.GetRootCACertificateResponse, error) { 408 log.G(ctx).WithFields(logrus.Fields{ 409 "method": "GetRootCACertificate", 410 }) 411 412 s.signingMu.Lock() 413 defer s.signingMu.Unlock() 414 415 return &api.GetRootCACertificateResponse{ 416 Certificate: s.localRootCA.Certs, 417 }, nil 418 } 419 420 // Run runs the CA signer main loop. 421 // The CA signer can be stopped with cancelling ctx or calling Stop(). 422 func (s *Server) Run(ctx context.Context) error { 423 s.mu.Lock() 424 if s.isRunning() { 425 s.mu.Unlock() 426 return errors.New("CA signer is already running") 427 } 428 s.wg.Add(1) 429 s.ctx, s.cancel = context.WithCancel(log.WithModule(ctx, "ca")) 430 ctx = s.ctx 431 s.mu.Unlock() 432 defer s.wg.Done() 433 defer func() { 434 s.mu.Lock() 435 s.mu.Unlock() 436 }() 437 438 // Retrieve the channels to keep track of changes in the cluster 439 // Retrieve all the currently registered nodes 440 var ( 441 nodes []*api.Node 442 cluster *api.Cluster 443 err error 444 ) 445 updates, cancel, err := store.ViewAndWatch( 446 s.store, 447 func(readTx store.ReadTx) error { 448 cluster = store.GetCluster(readTx, s.clusterID) 449 if cluster == nil { 450 return errors.New("could not find cluster object") 451 } 452 nodes, err = store.FindNodes(readTx, store.All) 453 return err 454 }, 455 api.EventCreateNode{}, 456 api.EventUpdateNode{}, 457 api.EventDeleteNode{}, 458 api.EventUpdateCluster{ 459 Cluster: &api.Cluster{ID: s.clusterID}, 460 Checks: []api.ClusterCheckFunc{api.ClusterCheckID}, 461 }, 462 ) 463 464 // call once to ensure that the join tokens and local/external CA signer are always set 465 rootReconciler := &rootRotationReconciler{ 466 ctx: log.WithField(ctx, "method", "(*Server).rootRotationReconciler"), 467 clusterID: s.clusterID, 468 store: s.store, 469 batchUpdateInterval: s.rootReconciliationRetryInterval, 470 } 471 472 s.UpdateRootCA(ctx, cluster, rootReconciler) 473 474 // Do this after updateCluster has been called, so Ready() and isRunning never returns true without 475 // the join tokens and external CA/security config's root CA being set correctly 476 s.mu.Lock() 477 close(s.started) 478 s.mu.Unlock() 479 480 if err != nil { 481 log.G(ctx).WithFields(logrus.Fields{ 482 "method": "(*Server).Run", 483 }).WithError(err).Errorf("snapshot store view failed") 484 return err 485 } 486 defer cancel() 487 488 // We might have missed some updates if there was a leader election, 489 // so let's pick up the slack. 490 if err := s.reconcileNodeCertificates(ctx, nodes); err != nil { 491 // We don't return here because that means the Run loop would 492 // never run. Log an error instead. 493 log.G(ctx).WithFields(logrus.Fields{ 494 "method": "(*Server).Run", 495 }).WithError(err).Errorf("error attempting to reconcile certificates") 496 } 497 498 ticker := time.NewTicker(s.reconciliationRetryInterval) 499 defer ticker.Stop() 500 501 externalTLSCredsChange, externalTLSWatchCancel := s.securityConfig.Watch() 502 defer externalTLSWatchCancel() 503 504 // Watch for new nodes being created, new nodes being updated, and changes 505 // to the cluster 506 for { 507 select { 508 case <-ctx.Done(): 509 return nil 510 default: 511 } 512 513 select { 514 case event := <-updates: 515 switch v := event.(type) { 516 case api.EventCreateNode: 517 s.evaluateAndSignNodeCert(ctx, v.Node) 518 rootReconciler.UpdateNode(v.Node) 519 case api.EventUpdateNode: 520 // If this certificate is already at a final state 521 // no need to evaluate and sign it. 522 if !isFinalState(v.Node.Certificate.Status) { 523 s.evaluateAndSignNodeCert(ctx, v.Node) 524 } 525 rootReconciler.UpdateNode(v.Node) 526 case api.EventDeleteNode: 527 rootReconciler.DeleteNode(v.Node) 528 case api.EventUpdateCluster: 529 if v.Cluster.ID == s.clusterID { 530 s.UpdateRootCA(ctx, v.Cluster, rootReconciler) 531 } 532 } 533 case <-externalTLSCredsChange: 534 // The TLS certificates can rotate independently of the root CA (and hence which roots the 535 // external CA trusts) and external CA URLs. It's possible that the root CA update is received 536 // before the external TLS cred change notification. During that period, it is possible that 537 // the TLS creds will expire or otherwise fail to authorize against external CAs. However, in 538 // that case signing will just fail with a recoverable connectivity error - the state of the 539 // certificate issuance is left as pending, and on the next tick, the server will try to sign 540 // all nodes with pending certs again (by which time the TLS cred change will have been 541 // received). 542 543 // Note that if the external CA changes, the new external CA *MUST* trust the current server's 544 // certificate issuer, and this server's certificates should not be extremely close to expiry, 545 // otherwise this server would not be able to get new TLS certificates and will no longer be 546 // able to function. 547 s.signingMu.Lock() 548 s.externalCA.UpdateTLSConfig(NewExternalCATLSConfig( 549 s.securityConfig.ClientTLSCreds.Config().Certificates, s.externalCAPool)) 550 s.signingMu.Unlock() 551 case <-ticker.C: 552 for _, node := range s.pending { 553 if err := s.evaluateAndSignNodeCert(ctx, node); err != nil { 554 // If this sign operation did not succeed, the rest are 555 // unlikely to. Yield so that we don't hammer an external CA. 556 // Since the map iteration order is randomized, there is no 557 // risk of getting stuck on a problematic CSR. 558 break 559 } 560 } 561 case <-ctx.Done(): 562 return nil 563 } 564 } 565 } 566 567 // Stop stops the CA and closes all grpc streams. 568 func (s *Server) Stop() error { 569 s.mu.Lock() 570 571 if !s.isRunning() { 572 s.mu.Unlock() 573 return errors.New("CA signer is already stopped") 574 } 575 s.cancel() 576 s.started = make(chan struct{}) 577 s.joinTokens = nil 578 s.mu.Unlock() 579 580 // Wait for Run to complete 581 s.wg.Wait() 582 583 return nil 584 } 585 586 // Ready waits on the ready channel and returns when the server is ready to serve. 587 func (s *Server) Ready() <-chan struct{} { 588 s.mu.Lock() 589 defer s.mu.Unlock() 590 return s.started 591 } 592 593 func (s *Server) isRunningLocked() (context.Context, error) { 594 s.mu.Lock() 595 if !s.isRunning() { 596 s.mu.Unlock() 597 return nil, status.Errorf(codes.Aborted, "CA signer is stopped") 598 } 599 ctx := s.ctx 600 s.mu.Unlock() 601 return ctx, nil 602 } 603 604 func (s *Server) isReadyLocked() error { 605 s.mu.Lock() 606 defer s.mu.Unlock() 607 if !s.isRunning() { 608 return status.Errorf(codes.Aborted, "CA signer is stopped") 609 } 610 if s.joinTokens == nil { 611 return status.Errorf(codes.Aborted, "CA signer is still starting") 612 } 613 return nil 614 } 615 616 func (s *Server) isRunning() bool { 617 if s.ctx == nil { 618 return false 619 } 620 select { 621 case <-s.ctx.Done(): 622 return false 623 default: 624 } 625 return true 626 } 627 628 // filterExternalCAURLS returns a list of external CA urls filtered by the desired cert. 629 func filterExternalCAURLS(ctx context.Context, desiredCert, defaultCert []byte, apiExternalCAs []*api.ExternalCA) (urls []string) { 630 desiredCert = NormalizePEMs(desiredCert) 631 632 // TODO(aaronl): In the future, this will be abstracted with an ExternalCA interface that has different 633 // implementations for different CA types. At the moment, only CFSSL is supported. 634 for i, extCA := range apiExternalCAs { 635 // We want to support old external CA specifications which did not have a CA cert. If there is no cert specified, 636 // we assume it's the old cert 637 certForExtCA := extCA.CACert 638 if len(certForExtCA) == 0 { 639 certForExtCA = defaultCert 640 } 641 certForExtCA = NormalizePEMs(certForExtCA) 642 if extCA.Protocol != api.ExternalCA_CAProtocolCFSSL { 643 log.G(ctx).Debugf("skipping external CA %d (url: %s) due to unknown protocol type", i, extCA.URL) 644 continue 645 } 646 if !bytes.Equal(certForExtCA, desiredCert) { 647 log.G(ctx).Debugf("skipping external CA %d (url: %s) because it has the wrong CA cert", i, extCA.URL) 648 continue 649 } 650 urls = append(urls, extCA.URL) 651 } 652 return 653 } 654 655 // UpdateRootCA is called when there are cluster changes, and it ensures that the local RootCA is 656 // always aware of changes in clusterExpiry and the Root CA key material - this can be called by 657 // anything to update the root CA material 658 func (s *Server) UpdateRootCA(ctx context.Context, cluster *api.Cluster, reconciler *rootRotationReconciler) error { 659 s.mu.Lock() 660 s.joinTokens = cluster.RootCA.JoinTokens.Copy() 661 s.mu.Unlock() 662 rCA := cluster.RootCA.Copy() 663 if reconciler != nil { 664 reconciler.UpdateRootCA(rCA) 665 } 666 667 s.signingMu.Lock() 668 defer s.signingMu.Unlock() 669 firstSeenCluster := s.lastSeenClusterRootCA == nil && s.lastSeenExternalCAs == nil 670 rootCAChanged := len(rCA.CACert) != 0 && !equality.RootCAEqualStable(s.lastSeenClusterRootCA, rCA) 671 externalCAChanged := !equality.ExternalCAsEqualStable(s.lastSeenExternalCAs, cluster.Spec.CAConfig.ExternalCAs) 672 ctx = log.WithLogger(ctx, log.G(ctx).WithFields(logrus.Fields{ 673 "cluster.id": cluster.ID, 674 "method": "(*Server).UpdateRootCA", 675 })) 676 677 if rootCAChanged { 678 setOrUpdate := "set" 679 if !firstSeenCluster { 680 log.G(ctx).Debug("Updating signing root CA and external CA due to change in cluster Root CA") 681 setOrUpdate = "updated" 682 } 683 expiry := DefaultNodeCertExpiration 684 if cluster.Spec.CAConfig.NodeCertExpiry != nil { 685 // NodeCertExpiry exists, let's try to parse the duration out of it 686 clusterExpiry, err := gogotypes.DurationFromProto(cluster.Spec.CAConfig.NodeCertExpiry) 687 if err != nil { 688 log.G(ctx).WithError(err).Warn("failed to parse certificate expiration, using default") 689 } else { 690 // We were able to successfully parse the expiration out of the cluster. 691 expiry = clusterExpiry 692 } 693 } else { 694 // NodeCertExpiry seems to be nil 695 log.G(ctx).Warn("no certificate expiration specified, using default") 696 } 697 // Attempt to update our local RootCA with the new parameters 698 updatedRootCA, err := RootCAFromAPI(ctx, rCA, expiry) 699 if err != nil { 700 return errors.Wrap(err, "invalid Root CA object in cluster") 701 } 702 703 s.localRootCA = &updatedRootCA 704 s.externalCAPool = updatedRootCA.Pool 705 externalCACert := rCA.CACert 706 if rCA.RootRotation != nil { 707 externalCACert = rCA.RootRotation.CACert 708 // the external CA has to trust the new CA cert 709 s.externalCAPool = x509.NewCertPool() 710 s.externalCAPool.AppendCertsFromPEM(rCA.CACert) 711 s.externalCAPool.AppendCertsFromPEM(rCA.RootRotation.CACert) 712 } 713 s.lastSeenExternalCAs = cluster.Spec.CAConfig.Copy().ExternalCAs 714 urls := filterExternalCAURLS(ctx, externalCACert, rCA.CACert, s.lastSeenExternalCAs) 715 // Replace the external CA with the relevant intermediates, URLS, and TLS config 716 s.externalCA = NewExternalCA(updatedRootCA.Intermediates, 717 NewExternalCATLSConfig(s.securityConfig.ClientTLSCreds.Config().Certificates, s.externalCAPool), urls...) 718 719 // only update the server cache if we've successfully updated the root CA 720 log.G(ctx).Debugf("Root CA %s successfully", setOrUpdate) 721 s.lastSeenClusterRootCA = rCA 722 } else if externalCAChanged { 723 // we want to update only if the external CA URLS have changed, since if the root CA has changed we already 724 // run similar logic 725 if !firstSeenCluster { 726 log.G(ctx).Debug("Updating security config external CA URLs due to change in cluster spec's list of external CAs") 727 } 728 wantedExternalCACert := rCA.CACert // we want to only add external CA URLs that use this cert 729 if rCA.RootRotation != nil { 730 // we're rotating to a new root, so we only want external CAs with the new root cert 731 wantedExternalCACert = rCA.RootRotation.CACert 732 } 733 // Update our external CA with the list of External CA URLs from the new cluster state 734 s.lastSeenExternalCAs = cluster.Spec.CAConfig.Copy().ExternalCAs 735 urls := filterExternalCAURLS(ctx, wantedExternalCACert, rCA.CACert, s.lastSeenExternalCAs) 736 s.externalCA.UpdateURLs(urls...) 737 } 738 return nil 739 } 740 741 // evaluateAndSignNodeCert implements the logic of which certificates to sign 742 func (s *Server) evaluateAndSignNodeCert(ctx context.Context, node *api.Node) error { 743 // If the desired membership and actual state are in sync, there's 744 // nothing to do. 745 certState := node.Certificate.Status.State 746 if node.Spec.Membership == api.NodeMembershipAccepted && 747 (certState == api.IssuanceStateIssued || certState == api.IssuanceStateRotate) { 748 return nil 749 } 750 751 // If the certificate state is renew, then it is a server-sided accepted cert (cert renewals) 752 if certState == api.IssuanceStateRenew { 753 return s.signNodeCert(ctx, node) 754 } 755 756 // Sign this certificate if a user explicitly changed it to Accepted, and 757 // the certificate is in pending state 758 if node.Spec.Membership == api.NodeMembershipAccepted && certState == api.IssuanceStatePending { 759 return s.signNodeCert(ctx, node) 760 } 761 762 return nil 763 } 764 765 // signNodeCert does the bulk of the work for signing a certificate 766 func (s *Server) signNodeCert(ctx context.Context, node *api.Node) error { 767 s.signingMu.Lock() 768 rootCA := s.localRootCA 769 externalCA := s.externalCA 770 s.signingMu.Unlock() 771 772 node = node.Copy() 773 nodeID := node.ID 774 // Convert the role from proto format 775 role, err := ParseRole(node.Certificate.Role) 776 if err != nil { 777 log.G(ctx).WithFields(logrus.Fields{ 778 "node.id": node.ID, 779 "method": "(*Server).signNodeCert", 780 }).WithError(err).Errorf("failed to parse role") 781 return errors.New("failed to parse role") 782 } 783 784 s.pending[node.ID] = node 785 786 // Attempt to sign the CSR 787 var ( 788 rawCSR = node.Certificate.CSR 789 cn = node.Certificate.CN 790 ou = role 791 org = s.clusterID 792 ) 793 794 // Try using the external CA first. 795 cert, err := externalCA.Sign(ctx, PrepareCSR(rawCSR, cn, ou, org)) 796 if err == ErrNoExternalCAURLs { 797 // No external CA servers configured. Try using the local CA. 798 cert, err = rootCA.ParseValidateAndSignCSR(rawCSR, cn, ou, org) 799 } 800 801 if err != nil { 802 log.G(ctx).WithFields(logrus.Fields{ 803 "node.id": node.ID, 804 "method": "(*Server).signNodeCert", 805 }).WithError(err).Errorf("failed to sign CSR") 806 807 // If the current state is already Failed, no need to change it 808 if node.Certificate.Status.State == api.IssuanceStateFailed { 809 delete(s.pending, node.ID) 810 return errors.New("failed to sign CSR") 811 } 812 813 if _, ok := err.(recoverableErr); ok { 814 // Return without changing the state of the certificate. We may 815 // retry signing it in the future. 816 return errors.New("failed to sign CSR") 817 } 818 819 // We failed to sign this CSR, change the state to FAILED 820 err = s.store.Update(func(tx store.Tx) error { 821 node := store.GetNode(tx, nodeID) 822 if node == nil { 823 return errors.Errorf("node %s not found", nodeID) 824 } 825 826 node.Certificate.Status = api.IssuanceStatus{ 827 State: api.IssuanceStateFailed, 828 Err: err.Error(), 829 } 830 831 return store.UpdateNode(tx, node) 832 }) 833 if err != nil { 834 log.G(ctx).WithFields(logrus.Fields{ 835 "node.id": nodeID, 836 "method": "(*Server).signNodeCert", 837 }).WithError(err).Errorf("transaction failed when setting state to FAILED") 838 } 839 840 delete(s.pending, node.ID) 841 return errors.New("failed to sign CSR") 842 } 843 844 // We were able to successfully sign the new CSR. Let's try to update the nodeStore 845 for { 846 err = s.store.Update(func(tx store.Tx) error { 847 node.Certificate.Certificate = cert 848 node.Certificate.Status = api.IssuanceStatus{ 849 State: api.IssuanceStateIssued, 850 } 851 852 err := store.UpdateNode(tx, node) 853 if err != nil { 854 node = store.GetNode(tx, nodeID) 855 if node == nil { 856 err = errors.Errorf("node %s does not exist", nodeID) 857 } 858 } 859 return err 860 }) 861 if err == nil { 862 log.G(ctx).WithFields(logrus.Fields{ 863 "node.id": node.ID, 864 "node.role": node.Certificate.Role, 865 "method": "(*Server).signNodeCert", 866 }).Debugf("certificate issued") 867 delete(s.pending, node.ID) 868 break 869 } 870 if err == store.ErrSequenceConflict { 871 continue 872 } 873 874 log.G(ctx).WithFields(logrus.Fields{ 875 "node.id": nodeID, 876 "method": "(*Server).signNodeCert", 877 }).WithError(err).Errorf("transaction failed") 878 return errors.New("transaction failed") 879 } 880 return nil 881 } 882 883 // reconcileNodeCertificates is a helper method that calls evaluateAndSignNodeCert on all the 884 // nodes. 885 func (s *Server) reconcileNodeCertificates(ctx context.Context, nodes []*api.Node) error { 886 for _, node := range nodes { 887 s.evaluateAndSignNodeCert(ctx, node) 888 } 889 890 return nil 891 } 892 893 // A successfully issued certificate and a failed certificate are our current final states 894 func isFinalState(status api.IssuanceStatus) bool { 895 if status.State == api.IssuanceStateIssued || status.State == api.IssuanceStateFailed || 896 status.State == api.IssuanceStateRotate { 897 return true 898 } 899 900 return false 901 } 902 903 // RootCAFromAPI creates a RootCA object from an api.RootCA object 904 func RootCAFromAPI(ctx context.Context, apiRootCA *api.RootCA, expiry time.Duration) (RootCA, error) { 905 var intermediates []byte 906 signingCert := apiRootCA.CACert 907 signingKey := apiRootCA.CAKey 908 if apiRootCA.RootRotation != nil { 909 signingCert = apiRootCA.RootRotation.CrossSignedCACert 910 signingKey = apiRootCA.RootRotation.CAKey 911 intermediates = apiRootCA.RootRotation.CrossSignedCACert 912 } 913 if signingKey == nil { 914 signingCert = nil 915 } 916 return NewRootCA(apiRootCA.CACert, signingCert, signingKey, expiry, intermediates) 917 }