github.phpd.cn/hashicorp/consul@v1.4.5/agent/consul/connect_ca_endpoint.go (about)

     1  package consul
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"reflect"
     8  	"strings"
     9  	"sync"
    10  	"time"
    11  
    12  	"github.com/hashicorp/consul/lib/semaphore"
    13  
    14  	"golang.org/x/time/rate"
    15  
    16  	"github.com/hashicorp/consul/acl"
    17  	"github.com/hashicorp/consul/agent/connect"
    18  	"github.com/hashicorp/consul/agent/consul/state"
    19  	"github.com/hashicorp/consul/agent/structs"
    20  	"github.com/hashicorp/go-memdb"
    21  )
    22  
var (
	// Canonical error strings for this endpoint.
	//
	// net/rpc doesn't have a way to transport typed/rich errors, so in the few
	// cases where client behavior needs to change we currently rely on sniffing
	// the error string. These are the canonical error strings to use.
	//
	// Note that client code can't use `err == consul.Err*` directly: the error
	// returned by RPC is a plain error value created by the net/rpc client, so
	// it is not the same _instance_ that these package variables point to.
	// Clients need to compare using
	// `err.Error() == consul.ErrRateLimited.Error()`, which is very sad. Short
	// of replacing our RPC mechanism it's hard to know how to make that much
	// better though.

	// ErrConnectNotEnabled is returned by the handlers in this file when
	// Connect is disabled in the server config.
	ErrConnectNotEnabled = errors.New("Connect must be enabled in order to use this endpoint")
	// ErrRateLimited is returned by Sign when no rate-limit token or
	// concurrency slot could be obtained within csrLimitWait.
	ErrRateLimited       = errors.New("Rate limit reached, try again later")
)
    36  
const (
	// csrLimitWait is the maximum time we'll wait for a slot when CSR
	// concurrency limiting or rate limiting is occurring. It's intentionally
	// short: small batches of requests can be accommodated when the server has
	// capacity (assuming signing one cert takes much less than this), while
	// requests fail fast when a thundering herd comes along.
	csrLimitWait = 500 * time.Millisecond
)
    45  
// ConnectCA manages the Connect CA. Its methods are the RPC handlers for
// reading and updating the CA configuration, listing trusted roots and
// signing leaf certificates.
type ConnectCA struct {
	// srv is a pointer back to the server.
	srv *Server

	// csrRateLimiter limits the rate of signing new certs if configured. It is
	// lazily initialized from the current config to support dynamic changes.
	// csrRateLimiterMu must be held while dereferencing the pointer or storing
	// a new one, but methods can be called on the limiter object outside of the
	// locked section. This is done only in the getCSRRateLimiterWithLimit
	// method.
	csrRateLimiter   *rate.Limiter
	csrRateLimiterMu sync.RWMutex

	// csrConcurrencyLimiter is a dynamically resizable semaphore used to limit
	// Sign RPC concurrency if configured. The zero value is usable as soon as
	// SetSize is called, which we do dynamically in the RPC handler to avoid
	// having to hook elaborate synchronization mechanisms through the CA config
	// endpoint and config reload etc.
	csrConcurrencyLimiter semaphore.Dynamic
}
    66  
    67  // getCSRRateLimiterWithLimit returns a rate.Limiter with the desired limit set.
    68  // It uses the shared server-wide limiter unless the limit has been changed in
    69  // config or the limiter has not been setup yet in which case it just-in-time
    70  // configures the new limiter. We assume that limit changes are relatively rare
    71  // and that all callers (there is currently only one) use the same config value
    72  // as the limit. There might be some flapping if there are multiple concurrent
    73  // requests in flight at the time the config changes where A sees the new value
    74  // and updates, B sees the old but then gets this lock second and changes back.
    75  // Eventually though and very soon (once all current RPCs are complete) we are
    76  // guaranteed to have the correct limit set by the next RPC that comes in so I
    77  // assume this is fine. If we observe strange behavior because of it, we could
    78  // add hysteresis that prevents changes too soon after a previous change but
    79  // that seems unnecessary for now.
    80  func (s *ConnectCA) getCSRRateLimiterWithLimit(limit rate.Limit) *rate.Limiter {
    81  	s.csrRateLimiterMu.RLock()
    82  	lim := s.csrRateLimiter
    83  	s.csrRateLimiterMu.RUnlock()
    84  
    85  	// If there is a current limiter with the same limit, return it. This should
    86  	// be the common case.
    87  	if lim != nil && lim.Limit() == limit {
    88  		return lim
    89  	}
    90  
    91  	// Need to change limiter, get write lock
    92  	s.csrRateLimiterMu.Lock()
    93  	defer s.csrRateLimiterMu.Unlock()
    94  	// No limiter yet, or limit changed in CA config, reconfigure a new limiter.
    95  	// We use burst of 1 for a hard limit. Note that either bursting or waiting is
    96  	// necessary to get expected behavior in fact of random arrival times, but we
    97  	// don't need both and we use Wait with a small delay to smooth noise. See
    98  	// https://github.com/banks/sim-rate-limit-backoff/blob/master/README.md.
    99  	s.csrRateLimiter = rate.NewLimiter(limit, 1)
   100  	return s.csrRateLimiter
   101  }
   102  
   103  // ConfigurationGet returns the configuration for the CA.
   104  func (s *ConnectCA) ConfigurationGet(
   105  	args *structs.DCSpecificRequest,
   106  	reply *structs.CAConfiguration) error {
   107  	// Exit early if Connect hasn't been enabled.
   108  	if !s.srv.config.ConnectEnabled {
   109  		return ErrConnectNotEnabled
   110  	}
   111  
   112  	if done, err := s.srv.forward("ConnectCA.ConfigurationGet", args, args, reply); done {
   113  		return err
   114  	}
   115  
   116  	// This action requires operator read access.
   117  	rule, err := s.srv.ResolveToken(args.Token)
   118  	if err != nil {
   119  		return err
   120  	}
   121  	if rule != nil && !rule.OperatorRead() {
   122  		return acl.ErrPermissionDenied
   123  	}
   124  
   125  	state := s.srv.fsm.State()
   126  	_, config, err := state.CAConfig()
   127  	if err != nil {
   128  		return err
   129  	}
   130  	*reply = *config
   131  
   132  	return nil
   133  }
   134  
   135  // ConfigurationSet updates the configuration for the CA.
   136  func (s *ConnectCA) ConfigurationSet(
   137  	args *structs.CARequest,
   138  	reply *interface{}) error {
   139  	// Exit early if Connect hasn't been enabled.
   140  	if !s.srv.config.ConnectEnabled {
   141  		return ErrConnectNotEnabled
   142  	}
   143  
   144  	if done, err := s.srv.forward("ConnectCA.ConfigurationSet", args, args, reply); done {
   145  		return err
   146  	}
   147  
   148  	// This action requires operator write access.
   149  	rule, err := s.srv.ResolveToken(args.Token)
   150  	if err != nil {
   151  		return err
   152  	}
   153  	if rule != nil && !rule.OperatorWrite() {
   154  		return acl.ErrPermissionDenied
   155  	}
   156  
   157  	// Exit early if it's a no-op change
   158  	state := s.srv.fsm.State()
   159  	confIdx, config, err := state.CAConfig()
   160  	if err != nil {
   161  		return err
   162  	}
   163  
   164  	// Don't allow users to change the ClusterID.
   165  	args.Config.ClusterID = config.ClusterID
   166  	if args.Config.Provider == config.Provider && reflect.DeepEqual(args.Config.Config, config.Config) {
   167  		return nil
   168  	}
   169  
   170  	// Create a new instance of the provider described by the config
   171  	// and get the current active root CA. This acts as a good validation
   172  	// of the config and makes sure the provider is functioning correctly
   173  	// before we commit any changes to Raft.
   174  	newProvider, err := s.srv.createCAProvider(args.Config)
   175  	if err != nil {
   176  		return fmt.Errorf("could not initialize provider: %v", err)
   177  	}
   178  	if err := newProvider.Configure(args.Config.ClusterID, true, args.Config.Config); err != nil {
   179  		return fmt.Errorf("error configuring provider: %v", err)
   180  	}
   181  	if err := newProvider.GenerateRoot(); err != nil {
   182  		return fmt.Errorf("error generating CA root certificate: %v", err)
   183  	}
   184  
   185  	newRootPEM, err := newProvider.ActiveRoot()
   186  	if err != nil {
   187  		return err
   188  	}
   189  
   190  	newActiveRoot, err := parseCARoot(newRootPEM, args.Config.Provider, args.Config.ClusterID)
   191  	if err != nil {
   192  		return err
   193  	}
   194  
   195  	// Compare the new provider's root CA ID to the current one. If they
   196  	// match, just update the existing provider with the new config.
   197  	// If they don't match, begin the root rotation process.
   198  	_, root, err := state.CARootActive(nil)
   199  	if err != nil {
   200  		return err
   201  	}
   202  
   203  	// If the root didn't change or if this is a secondary DC, just update the
   204  	// config and return.
   205  	if (s.srv.config.Datacenter != s.srv.config.PrimaryDatacenter) ||
   206  		root != nil && root.ID == newActiveRoot.ID {
   207  		args.Op = structs.CAOpSetConfig
   208  		resp, err := s.srv.raftApply(structs.ConnectCARequestType, args)
   209  		if err != nil {
   210  			return err
   211  		}
   212  		if respErr, ok := resp.(error); ok {
   213  			return respErr
   214  		}
   215  
   216  		// If the config has been committed, update the local provider instance
   217  		s.srv.setCAProvider(newProvider, newActiveRoot)
   218  
   219  		s.srv.logger.Printf("[INFO] connect: CA provider config updated")
   220  
   221  		return nil
   222  	}
   223  
   224  	// At this point, we know the config change has trigged a root rotation,
   225  	// either by swapping the provider type or changing the provider's config
   226  	// to use a different root certificate.
   227  
   228  	// If it's a config change that would trigger a rotation (different provider/root):
   229  	// 1. Get the root from the new provider.
   230  	// 2. Call CrossSignCA on the old provider to sign the new root with the old one to
   231  	// get a cross-signed certificate.
   232  	// 3. Take the active root for the new provider and append the intermediate from step 2
   233  	// to its list of intermediates.
   234  	newRoot, err := connect.ParseCert(newRootPEM)
   235  	if err != nil {
   236  		return err
   237  	}
   238  
   239  	// Have the old provider cross-sign the new intermediate
   240  	oldProvider, _ := s.srv.getCAProvider()
   241  	if oldProvider == nil {
   242  		return fmt.Errorf("internal error: CA provider is nil")
   243  	}
   244  	xcCert, err := oldProvider.CrossSignCA(newRoot)
   245  	if err != nil {
   246  		return err
   247  	}
   248  
   249  	// Add the cross signed cert to the new root's intermediates.
   250  	newActiveRoot.IntermediateCerts = []string{xcCert}
   251  	intermediate, err := newProvider.GenerateIntermediate()
   252  	if err != nil {
   253  		return err
   254  	}
   255  	if intermediate != newRootPEM {
   256  		newActiveRoot.IntermediateCerts = append(newActiveRoot.IntermediateCerts, intermediate)
   257  	}
   258  
   259  	// Update the roots and CA config in the state store at the same time
   260  	idx, roots, err := state.CARoots(nil)
   261  	if err != nil {
   262  		return err
   263  	}
   264  
   265  	var newRoots structs.CARoots
   266  	for _, r := range roots {
   267  		newRoot := *r
   268  		if newRoot.Active {
   269  			newRoot.Active = false
   270  			newRoot.RotatedOutAt = time.Now()
   271  		}
   272  		newRoots = append(newRoots, &newRoot)
   273  	}
   274  	newRoots = append(newRoots, newActiveRoot)
   275  
   276  	args.Op = structs.CAOpSetRootsAndConfig
   277  	args.Index = idx
   278  	args.Config.ModifyIndex = confIdx
   279  	args.Roots = newRoots
   280  	resp, err := s.srv.raftApply(structs.ConnectCARequestType, args)
   281  	if err != nil {
   282  		return err
   283  	}
   284  	if respErr, ok := resp.(error); ok {
   285  		return respErr
   286  	}
   287  	if respOk, ok := resp.(bool); ok && !respOk {
   288  		return fmt.Errorf("could not atomically update roots and config")
   289  	}
   290  
   291  	// If the config has been committed, update the local provider instance
   292  	// and call teardown on the old provider
   293  	s.srv.setCAProvider(newProvider, newActiveRoot)
   294  
   295  	if err := oldProvider.Cleanup(); err != nil {
   296  		s.srv.logger.Printf("[WARN] connect: failed to clean up old provider %q", config.Provider)
   297  	}
   298  
   299  	s.srv.logger.Printf("[INFO] connect: CA rotated to new root under provider %q", args.Config.Provider)
   300  
   301  	return nil
   302  }
   303  
   304  // Roots returns the currently trusted root certificates.
   305  func (s *ConnectCA) Roots(
   306  	args *structs.DCSpecificRequest,
   307  	reply *structs.IndexedCARoots) error {
   308  	// Forward if necessary
   309  	if done, err := s.srv.forward("ConnectCA.Roots", args, args, reply); done {
   310  		return err
   311  	}
   312  
   313  	// Exit early if Connect hasn't been enabled.
   314  	if !s.srv.config.ConnectEnabled {
   315  		return ErrConnectNotEnabled
   316  	}
   317  
   318  	// Load the ClusterID to generate TrustDomain. We do this outside the loop
   319  	// since by definition this value should be immutable once set for lifetime of
   320  	// the cluster so we don't need to look it up more than once. We also don't
   321  	// have to worry about non-atomicity between the config fetch transaction and
   322  	// the CARoots transaction below since this field must remain immutable. Do
   323  	// not re-use this state/config for other logic that might care about changes
   324  	// of config during the blocking query below.
   325  	{
   326  		state := s.srv.fsm.State()
   327  		_, config, err := state.CAConfig()
   328  		if err != nil {
   329  			return err
   330  		}
   331  
   332  		// Check CA is actually bootstrapped...
   333  		if config != nil {
   334  			// Build TrustDomain based on the ClusterID stored.
   335  			signingID := connect.SpiffeIDSigningForCluster(config)
   336  			if signingID == nil {
   337  				// If CA is bootstrapped at all then this should never happen but be
   338  				// defensive.
   339  				return errors.New("no cluster trust domain setup")
   340  			}
   341  			reply.TrustDomain = signingID.Host()
   342  		}
   343  	}
   344  
   345  	return s.srv.blockingQuery(
   346  		&args.QueryOptions, &reply.QueryMeta,
   347  		func(ws memdb.WatchSet, state *state.Store) error {
   348  			index, roots, err := state.CARoots(ws)
   349  			if err != nil {
   350  				return err
   351  			}
   352  
   353  			reply.Index, reply.Roots = index, roots
   354  			if reply.Roots == nil {
   355  				reply.Roots = make(structs.CARoots, 0)
   356  			}
   357  
   358  			// The API response must NEVER contain the secret information
   359  			// such as keys and so on. We use a whitelist below to copy the
   360  			// specific fields we want to expose.
   361  			for i, r := range reply.Roots {
   362  				// IMPORTANT: r must NEVER be modified, since it is a pointer
   363  				// directly to the structure in the memdb store.
   364  
   365  				reply.Roots[i] = &structs.CARoot{
   366  					ID:                  r.ID,
   367  					Name:                r.Name,
   368  					SerialNumber:        r.SerialNumber,
   369  					SigningKeyID:        r.SigningKeyID,
   370  					ExternalTrustDomain: r.ExternalTrustDomain,
   371  					NotBefore:           r.NotBefore,
   372  					NotAfter:            r.NotAfter,
   373  					RootCert:            r.RootCert,
   374  					IntermediateCerts:   r.IntermediateCerts,
   375  					RaftIndex:           r.RaftIndex,
   376  					Active:              r.Active,
   377  				}
   378  
   379  				if r.Active {
   380  					reply.ActiveRootID = r.ID
   381  				}
   382  			}
   383  
   384  			return nil
   385  		},
   386  	)
   387  }
   388  
   389  // Sign signs a certificate for a service.
   390  func (s *ConnectCA) Sign(
   391  	args *structs.CASignRequest,
   392  	reply *structs.IssuedCert) error {
   393  	// Exit early if Connect hasn't been enabled.
   394  	if !s.srv.config.ConnectEnabled {
   395  		return ErrConnectNotEnabled
   396  	}
   397  
   398  	if done, err := s.srv.forward("ConnectCA.Sign", args, args, reply); done {
   399  		return err
   400  	}
   401  
   402  	// Parse the CSR
   403  	csr, err := connect.ParseCSR(args.CSR)
   404  	if err != nil {
   405  		return err
   406  	}
   407  
   408  	// Parse the SPIFFE ID
   409  	spiffeID, err := connect.ParseCertURI(csr.URIs[0])
   410  	if err != nil {
   411  		return err
   412  	}
   413  	serviceID, ok := spiffeID.(*connect.SpiffeIDService)
   414  	if !ok {
   415  		return fmt.Errorf("SPIFFE ID in CSR must be a service ID")
   416  	}
   417  
   418  	provider, caRoot := s.srv.getCAProvider()
   419  	if provider == nil {
   420  		return fmt.Errorf("internal error: CA provider is nil")
   421  	}
   422  
   423  	// Verify that the CSR entity is in the cluster's trust domain
   424  	state := s.srv.fsm.State()
   425  	_, config, err := state.CAConfig()
   426  	if err != nil {
   427  		return err
   428  	}
   429  	signingID := connect.SpiffeIDSigningForCluster(config)
   430  	if !signingID.CanSign(serviceID) {
   431  		return fmt.Errorf("SPIFFE ID in CSR from a different trust domain: %s, "+
   432  			"we are %s", serviceID.Host, signingID.Host())
   433  	}
   434  
   435  	// Verify that the ACL token provided has permission to act as this service
   436  	rule, err := s.srv.ResolveToken(args.Token)
   437  	if err != nil {
   438  		return err
   439  	}
   440  	if rule != nil && !rule.ServiceWrite(serviceID.Service, nil) {
   441  		return acl.ErrPermissionDenied
   442  	}
   443  
   444  	// Verify that the DC in the service URI matches us. We might relax this
   445  	// requirement later but being restrictive for now is safer.
   446  	if serviceID.Datacenter != s.srv.config.Datacenter {
   447  		return fmt.Errorf("SPIFFE ID in CSR from a different datacenter: %s, "+
   448  			"we are %s", serviceID.Datacenter, s.srv.config.Datacenter)
   449  	}
   450  
   451  	commonCfg, err := config.GetCommonConfig()
   452  	if err != nil {
   453  		return err
   454  	}
   455  	if commonCfg.CSRMaxPerSecond > 0 {
   456  		lim := s.getCSRRateLimiterWithLimit(rate.Limit(commonCfg.CSRMaxPerSecond))
   457  		// Wait up to the small threshold we allow for a token.
   458  		ctx, cancel := context.WithTimeout(context.Background(), csrLimitWait)
   459  		defer cancel()
   460  		if lim.Wait(ctx) != nil {
   461  			return ErrRateLimited
   462  		}
   463  	} else if commonCfg.CSRMaxConcurrent > 0 {
   464  		s.csrConcurrencyLimiter.SetSize(int64(commonCfg.CSRMaxConcurrent))
   465  		ctx, cancel := context.WithTimeout(context.Background(), csrLimitWait)
   466  		defer cancel()
   467  		if err := s.csrConcurrencyLimiter.Acquire(ctx); err != nil {
   468  			return ErrRateLimited
   469  		}
   470  		defer s.csrConcurrencyLimiter.Release()
   471  	}
   472  
   473  	// All seems to be in order, actually sign it.
   474  	pem, err := provider.Sign(csr)
   475  	if err != nil {
   476  		return err
   477  	}
   478  
   479  	// Append any intermediates needed by this root.
   480  	for _, p := range caRoot.IntermediateCerts {
   481  		pem = strings.TrimSpace(pem) + "\n" + p
   482  	}
   483  
   484  	// Append our local CA's intermediate if there is one.
   485  	inter, err := provider.ActiveIntermediate()
   486  	if err != nil {
   487  		return err
   488  	}
   489  	root, err := provider.ActiveRoot()
   490  	if err != nil {
   491  		return err
   492  	}
   493  
   494  	if inter != root {
   495  		pem = strings.TrimSpace(pem) + "\n" + inter
   496  	}
   497  
   498  	// TODO(banks): when we implement IssuedCerts table we can use the insert to
   499  	// that as the raft index to return in response.
   500  	//
   501  	// UPDATE(mkeeler): The original implementation relied on updating the CAConfig
   502  	// and using its index as the ModifyIndex for certs. This was buggy. The long
   503  	// term goal is still to insert some metadata into raft about the certificates
   504  	// and use that raft index for the ModifyIndex. This is a partial step in that
   505  	// direction except that we only are setting an index and not storing the
   506  	// metadata.
   507  	req := structs.CALeafRequest{
   508  		Op:           structs.CALeafOpIncrementIndex,
   509  		Datacenter:   s.srv.config.Datacenter,
   510  		WriteRequest: structs.WriteRequest{Token: args.Token},
   511  	}
   512  
   513  	resp, err := s.srv.raftApply(structs.ConnectCALeafRequestType|structs.IgnoreUnknownTypeFlag, &req)
   514  	if err != nil {
   515  		return err
   516  	}
   517  
   518  	modIdx, ok := resp.(uint64)
   519  	if !ok {
   520  		return fmt.Errorf("Invalid response from updating the leaf cert index")
   521  	}
   522  
   523  	cert, err := connect.ParseCert(pem)
   524  	if err != nil {
   525  		return err
   526  	}
   527  
   528  	// Set the response
   529  	*reply = structs.IssuedCert{
   530  		SerialNumber: connect.HexString(cert.SerialNumber.Bytes()),
   531  		CertPEM:      pem,
   532  		Service:      serviceID.Service,
   533  		ServiceURI:   cert.URIs[0].String(),
   534  		ValidAfter:   cert.NotBefore,
   535  		ValidBefore:  cert.NotAfter,
   536  		RaftIndex: structs.RaftIndex{
   537  			ModifyIndex: modIdx,
   538  			CreateIndex: modIdx,
   539  		},
   540  	}
   541  
   542  	return nil
   543  }