github.phpd.cn/hashicorp/consul@v1.4.5/agent/consul/connect_ca_endpoint.go (about) 1 package consul 2 3 import ( 4 "context" 5 "errors" 6 "fmt" 7 "reflect" 8 "strings" 9 "sync" 10 "time" 11 12 "github.com/hashicorp/consul/lib/semaphore" 13 14 "golang.org/x/time/rate" 15 16 "github.com/hashicorp/consul/acl" 17 "github.com/hashicorp/consul/agent/connect" 18 "github.com/hashicorp/consul/agent/consul/state" 19 "github.com/hashicorp/consul/agent/structs" 20 "github.com/hashicorp/go-memdb" 21 ) 22 23 var ( 24 // Err strings. net/rpc doesn't have a way to transport typed/rich errors so 25 // we currently rely on sniffing the error string in a few cases where we need 26 // to change client behavior. These are the canonical error strings to use. 27 // Note though that client code can't use `err == consul.Err*` directly since 28 // the error returned by RPC will be a plain error.errorString created by 29 // net/rpc client so will not be the same _instance_ that this package 30 // variable points to. Clients need to compare using `err.Error() == 31 // consul.ErrRateLimited.Error()` which is very sad. Short of replacing our 32 // RPC mechanism it's hard to know how to make that much better though. 33 ErrConnectNotEnabled = errors.New("Connect must be enabled in order to use this endpoint") 34 ErrRateLimited = errors.New("Rate limit reached, try again later") 35 ) 36 37 const ( 38 // csrLimitWait is the maximum time we'll wait for a slot when CSR concurrency 39 // limiting or rate limiting is occurring. It's intentionally short so small 40 // batches of requests can be accommodated when server has capacity (assuming 41 // signing one cert takes much less than this) but failing requests fast when 42 // a thundering herd comes along. 43 csrLimitWait = 500 * time.Millisecond 44 ) 45 46 // ConnectCA manages the Connect CA. 47 type ConnectCA struct { 48 // srv is a pointer back to the server. 49 srv *Server 50 51 // csrRateLimiter limits the rate of signing new certs if configured. Lazily 52 // initialized from current config to support dynamic changes. 53 // csrRateLimiterMu must be held while dereferencing the pointer or storing a 54 // new one, but methods can be called on the limiter object outside of the 55 // locked section. This is done only in the getCSRRateLimiterWithLimit method. 56 csrRateLimiter *rate.Limiter 57 csrRateLimiterMu sync.RWMutex 58 59 // csrConcurrencyLimiter is a dynamically resizable semaphore used to limit 60 // Sign RPC concurrency if configured. The zero value is usable as soon as 61 // SetSize is called which we do dynamically in the RPC handler to avoid 62 // having to hook elaborate synchronization mechanisms through the CA config 63 // endpoint and config reload etc. 64 csrConcurrencyLimiter semaphore.Dynamic 65 } 66 67 // getCSRRateLimiterWithLimit returns a rate.Limiter with the desired limit set. 68 // It uses the shared server-wide limiter unless the limit has been changed in 69 // config or the limiter has not been setup yet in which case it just-in-time 70 // configures the new limiter. We assume that limit changes are relatively rare 71 // and that all callers (there is currently only one) use the same config value 72 // as the limit. There might be some flapping if there are multiple concurrent 73 // requests in flight at the time the config changes where A sees the new value 74 // and updates, B sees the old but then gets this lock second and changes back. 75 // Eventually though and very soon (once all current RPCs are complete) we are 76 // guaranteed to have the correct limit set by the next RPC that comes in so I 77 // assume this is fine. If we observe strange behavior because of it, we could 78 // add hysteresis that prevents changes too soon after a previous change but 79 // that seems unnecessary for now. 80 func (s *ConnectCA) getCSRRateLimiterWithLimit(limit rate.Limit) *rate.Limiter { 81 s.csrRateLimiterMu.RLock() 82 lim := s.csrRateLimiter 83 s.csrRateLimiterMu.RUnlock() 84 85 // If there is a current limiter with the same limit, return it. This should 86 // be the common case. 87 if lim != nil && lim.Limit() == limit { 88 return lim 89 } 90 91 // Need to change limiter, get write lock 92 s.csrRateLimiterMu.Lock() 93 defer s.csrRateLimiterMu.Unlock() 94 // No limiter yet, or limit changed in CA config, reconfigure a new limiter. 95 // We use burst of 1 for a hard limit. Note that either bursting or waiting is 96 // necessary to get expected behavior in fact of random arrival times, but we 97 // don't need both and we use Wait with a small delay to smooth noise. See 98 // https://github.com/banks/sim-rate-limit-backoff/blob/master/README.md. 99 s.csrRateLimiter = rate.NewLimiter(limit, 1) 100 return s.csrRateLimiter 101 } 102 103 // ConfigurationGet returns the configuration for the CA. 104 func (s *ConnectCA) ConfigurationGet( 105 args *structs.DCSpecificRequest, 106 reply *structs.CAConfiguration) error { 107 // Exit early if Connect hasn't been enabled. 108 if !s.srv.config.ConnectEnabled { 109 return ErrConnectNotEnabled 110 } 111 112 if done, err := s.srv.forward("ConnectCA.ConfigurationGet", args, args, reply); done { 113 return err 114 } 115 116 // This action requires operator read access. 117 rule, err := s.srv.ResolveToken(args.Token) 118 if err != nil { 119 return err 120 } 121 if rule != nil && !rule.OperatorRead() { 122 return acl.ErrPermissionDenied 123 } 124 125 state := s.srv.fsm.State() 126 _, config, err := state.CAConfig() 127 if err != nil { 128 return err 129 } 130 *reply = *config 131 132 return nil 133 } 134 135 // ConfigurationSet updates the configuration for the CA. 136 func (s *ConnectCA) ConfigurationSet( 137 args *structs.CARequest, 138 reply *interface{}) error { 139 // Exit early if Connect hasn't been enabled. 140 if !s.srv.config.ConnectEnabled { 141 return ErrConnectNotEnabled 142 } 143 144 if done, err := s.srv.forward("ConnectCA.ConfigurationSet", args, args, reply); done { 145 return err 146 } 147 148 // This action requires operator write access. 149 rule, err := s.srv.ResolveToken(args.Token) 150 if err != nil { 151 return err 152 } 153 if rule != nil && !rule.OperatorWrite() { 154 return acl.ErrPermissionDenied 155 } 156 157 // Exit early if it's a no-op change 158 state := s.srv.fsm.State() 159 confIdx, config, err := state.CAConfig() 160 if err != nil { 161 return err 162 } 163 164 // Don't allow users to change the ClusterID. 165 args.Config.ClusterID = config.ClusterID 166 if args.Config.Provider == config.Provider && reflect.DeepEqual(args.Config.Config, config.Config) { 167 return nil 168 } 169 170 // Create a new instance of the provider described by the config 171 // and get the current active root CA. This acts as a good validation 172 // of the config and makes sure the provider is functioning correctly 173 // before we commit any changes to Raft. 174 newProvider, err := s.srv.createCAProvider(args.Config) 175 if err != nil { 176 return fmt.Errorf("could not initialize provider: %v", err) 177 } 178 if err := newProvider.Configure(args.Config.ClusterID, true, args.Config.Config); err != nil { 179 return fmt.Errorf("error configuring provider: %v", err) 180 } 181 if err := newProvider.GenerateRoot(); err != nil { 182 return fmt.Errorf("error generating CA root certificate: %v", err) 183 } 184 185 newRootPEM, err := newProvider.ActiveRoot() 186 if err != nil { 187 return err 188 } 189 190 newActiveRoot, err := parseCARoot(newRootPEM, args.Config.Provider, args.Config.ClusterID) 191 if err != nil { 192 return err 193 } 194 195 // Compare the new provider's root CA ID to the current one. If they 196 // match, just update the existing provider with the new config. 197 // If they don't match, begin the root rotation process. 198 _, root, err := state.CARootActive(nil) 199 if err != nil { 200 return err 201 } 202 203 // If the root didn't change or if this is a secondary DC, just update the 204 // config and return. 205 if (s.srv.config.Datacenter != s.srv.config.PrimaryDatacenter) || 206 root != nil && root.ID == newActiveRoot.ID { 207 args.Op = structs.CAOpSetConfig 208 resp, err := s.srv.raftApply(structs.ConnectCARequestType, args) 209 if err != nil { 210 return err 211 } 212 if respErr, ok := resp.(error); ok { 213 return respErr 214 } 215 216 // If the config has been committed, update the local provider instance 217 s.srv.setCAProvider(newProvider, newActiveRoot) 218 219 s.srv.logger.Printf("[INFO] connect: CA provider config updated") 220 221 return nil 222 } 223 224 // At this point, we know the config change has trigged a root rotation, 225 // either by swapping the provider type or changing the provider's config 226 // to use a different root certificate. 227 228 // If it's a config change that would trigger a rotation (different provider/root): 229 // 1. Get the root from the new provider. 230 // 2. Call CrossSignCA on the old provider to sign the new root with the old one to 231 // get a cross-signed certificate. 232 // 3. Take the active root for the new provider and append the intermediate from step 2 233 // to its list of intermediates. 234 newRoot, err := connect.ParseCert(newRootPEM) 235 if err != nil { 236 return err 237 } 238 239 // Have the old provider cross-sign the new intermediate 240 oldProvider, _ := s.srv.getCAProvider() 241 if oldProvider == nil { 242 return fmt.Errorf("internal error: CA provider is nil") 243 } 244 xcCert, err := oldProvider.CrossSignCA(newRoot) 245 if err != nil { 246 return err 247 } 248 249 // Add the cross signed cert to the new root's intermediates. 250 newActiveRoot.IntermediateCerts = []string{xcCert} 251 intermediate, err := newProvider.GenerateIntermediate() 252 if err != nil { 253 return err 254 } 255 if intermediate != newRootPEM { 256 newActiveRoot.IntermediateCerts = append(newActiveRoot.IntermediateCerts, intermediate) 257 } 258 259 // Update the roots and CA config in the state store at the same time 260 idx, roots, err := state.CARoots(nil) 261 if err != nil { 262 return err 263 } 264 265 var newRoots structs.CARoots 266 for _, r := range roots { 267 newRoot := *r 268 if newRoot.Active { 269 newRoot.Active = false 270 newRoot.RotatedOutAt = time.Now() 271 } 272 newRoots = append(newRoots, &newRoot) 273 } 274 newRoots = append(newRoots, newActiveRoot) 275 276 args.Op = structs.CAOpSetRootsAndConfig 277 args.Index = idx 278 args.Config.ModifyIndex = confIdx 279 args.Roots = newRoots 280 resp, err := s.srv.raftApply(structs.ConnectCARequestType, args) 281 if err != nil { 282 return err 283 } 284 if respErr, ok := resp.(error); ok { 285 return respErr 286 } 287 if respOk, ok := resp.(bool); ok && !respOk { 288 return fmt.Errorf("could not atomically update roots and config") 289 } 290 291 // If the config has been committed, update the local provider instance 292 // and call teardown on the old provider 293 s.srv.setCAProvider(newProvider, newActiveRoot) 294 295 if err := oldProvider.Cleanup(); err != nil { 296 s.srv.logger.Printf("[WARN] connect: failed to clean up old provider %q", config.Provider) 297 } 298 299 s.srv.logger.Printf("[INFO] connect: CA rotated to new root under provider %q", args.Config.Provider) 300 301 return nil 302 } 303 304 // Roots returns the currently trusted root certificates. 305 func (s *ConnectCA) Roots( 306 args *structs.DCSpecificRequest, 307 reply *structs.IndexedCARoots) error { 308 // Forward if necessary 309 if done, err := s.srv.forward("ConnectCA.Roots", args, args, reply); done { 310 return err 311 } 312 313 // Exit early if Connect hasn't been enabled. 314 if !s.srv.config.ConnectEnabled { 315 return ErrConnectNotEnabled 316 } 317 318 // Load the ClusterID to generate TrustDomain. We do this outside the loop 319 // since by definition this value should be immutable once set for lifetime of 320 // the cluster so we don't need to look it up more than once. We also don't 321 // have to worry about non-atomicity between the config fetch transaction and 322 // the CARoots transaction below since this field must remain immutable. Do 323 // not re-use this state/config for other logic that might care about changes 324 // of config during the blocking query below. 325 { 326 state := s.srv.fsm.State() 327 _, config, err := state.CAConfig() 328 if err != nil { 329 return err 330 } 331 332 // Check CA is actually bootstrapped... 333 if config != nil { 334 // Build TrustDomain based on the ClusterID stored. 335 signingID := connect.SpiffeIDSigningForCluster(config) 336 if signingID == nil { 337 // If CA is bootstrapped at all then this should never happen but be 338 // defensive. 339 return errors.New("no cluster trust domain setup") 340 } 341 reply.TrustDomain = signingID.Host() 342 } 343 } 344 345 return s.srv.blockingQuery( 346 &args.QueryOptions, &reply.QueryMeta, 347 func(ws memdb.WatchSet, state *state.Store) error { 348 index, roots, err := state.CARoots(ws) 349 if err != nil { 350 return err 351 } 352 353 reply.Index, reply.Roots = index, roots 354 if reply.Roots == nil { 355 reply.Roots = make(structs.CARoots, 0) 356 } 357 358 // The API response must NEVER contain the secret information 359 // such as keys and so on. We use a whitelist below to copy the 360 // specific fields we want to expose. 361 for i, r := range reply.Roots { 362 // IMPORTANT: r must NEVER be modified, since it is a pointer 363 // directly to the structure in the memdb store. 364 365 reply.Roots[i] = &structs.CARoot{ 366 ID: r.ID, 367 Name: r.Name, 368 SerialNumber: r.SerialNumber, 369 SigningKeyID: r.SigningKeyID, 370 ExternalTrustDomain: r.ExternalTrustDomain, 371 NotBefore: r.NotBefore, 372 NotAfter: r.NotAfter, 373 RootCert: r.RootCert, 374 IntermediateCerts: r.IntermediateCerts, 375 RaftIndex: r.RaftIndex, 376 Active: r.Active, 377 } 378 379 if r.Active { 380 reply.ActiveRootID = r.ID 381 } 382 } 383 384 return nil 385 }, 386 ) 387 } 388 389 // Sign signs a certificate for a service. 390 func (s *ConnectCA) Sign( 391 args *structs.CASignRequest, 392 reply *structs.IssuedCert) error { 393 // Exit early if Connect hasn't been enabled. 394 if !s.srv.config.ConnectEnabled { 395 return ErrConnectNotEnabled 396 } 397 398 if done, err := s.srv.forward("ConnectCA.Sign", args, args, reply); done { 399 return err 400 } 401 402 // Parse the CSR 403 csr, err := connect.ParseCSR(args.CSR) 404 if err != nil { 405 return err 406 } 407 408 // Parse the SPIFFE ID 409 spiffeID, err := connect.ParseCertURI(csr.URIs[0]) 410 if err != nil { 411 return err 412 } 413 serviceID, ok := spiffeID.(*connect.SpiffeIDService) 414 if !ok { 415 return fmt.Errorf("SPIFFE ID in CSR must be a service ID") 416 } 417 418 provider, caRoot := s.srv.getCAProvider() 419 if provider == nil { 420 return fmt.Errorf("internal error: CA provider is nil") 421 } 422 423 // Verify that the CSR entity is in the cluster's trust domain 424 state := s.srv.fsm.State() 425 _, config, err := state.CAConfig() 426 if err != nil { 427 return err 428 } 429 signingID := connect.SpiffeIDSigningForCluster(config) 430 if !signingID.CanSign(serviceID) { 431 return fmt.Errorf("SPIFFE ID in CSR from a different trust domain: %s, "+ 432 "we are %s", serviceID.Host, signingID.Host()) 433 } 434 435 // Verify that the ACL token provided has permission to act as this service 436 rule, err := s.srv.ResolveToken(args.Token) 437 if err != nil { 438 return err 439 } 440 if rule != nil && !rule.ServiceWrite(serviceID.Service, nil) { 441 return acl.ErrPermissionDenied 442 } 443 444 // Verify that the DC in the service URI matches us. We might relax this 445 // requirement later but being restrictive for now is safer. 446 if serviceID.Datacenter != s.srv.config.Datacenter { 447 return fmt.Errorf("SPIFFE ID in CSR from a different datacenter: %s, "+ 448 "we are %s", serviceID.Datacenter, s.srv.config.Datacenter) 449 } 450 451 commonCfg, err := config.GetCommonConfig() 452 if err != nil { 453 return err 454 } 455 if commonCfg.CSRMaxPerSecond > 0 { 456 lim := s.getCSRRateLimiterWithLimit(rate.Limit(commonCfg.CSRMaxPerSecond)) 457 // Wait up to the small threshold we allow for a token. 458 ctx, cancel := context.WithTimeout(context.Background(), csrLimitWait) 459 defer cancel() 460 if lim.Wait(ctx) != nil { 461 return ErrRateLimited 462 } 463 } else if commonCfg.CSRMaxConcurrent > 0 { 464 s.csrConcurrencyLimiter.SetSize(int64(commonCfg.CSRMaxConcurrent)) 465 ctx, cancel := context.WithTimeout(context.Background(), csrLimitWait) 466 defer cancel() 467 if err := s.csrConcurrencyLimiter.Acquire(ctx); err != nil { 468 return ErrRateLimited 469 } 470 defer s.csrConcurrencyLimiter.Release() 471 } 472 473 // All seems to be in order, actually sign it. 474 pem, err := provider.Sign(csr) 475 if err != nil { 476 return err 477 } 478 479 // Append any intermediates needed by this root. 480 for _, p := range caRoot.IntermediateCerts { 481 pem = strings.TrimSpace(pem) + "\n" + p 482 } 483 484 // Append our local CA's intermediate if there is one. 485 inter, err := provider.ActiveIntermediate() 486 if err != nil { 487 return err 488 } 489 root, err := provider.ActiveRoot() 490 if err != nil { 491 return err 492 } 493 494 if inter != root { 495 pem = strings.TrimSpace(pem) + "\n" + inter 496 } 497 498 // TODO(banks): when we implement IssuedCerts table we can use the insert to 499 // that as the raft index to return in response. 500 // 501 // UPDATE(mkeeler): The original implementation relied on updating the CAConfig 502 // and using its index as the ModifyIndex for certs. This was buggy. The long 503 // term goal is still to insert some metadata into raft about the certificates 504 // and use that raft index for the ModifyIndex. This is a partial step in that 505 // direction except that we only are setting an index and not storing the 506 // metadata. 507 req := structs.CALeafRequest{ 508 Op: structs.CALeafOpIncrementIndex, 509 Datacenter: s.srv.config.Datacenter, 510 WriteRequest: structs.WriteRequest{Token: args.Token}, 511 } 512 513 resp, err := s.srv.raftApply(structs.ConnectCALeafRequestType|structs.IgnoreUnknownTypeFlag, &req) 514 if err != nil { 515 return err 516 } 517 518 modIdx, ok := resp.(uint64) 519 if !ok { 520 return fmt.Errorf("Invalid response from updating the leaf cert index") 521 } 522 523 cert, err := connect.ParseCert(pem) 524 if err != nil { 525 return err 526 } 527 528 // Set the response 529 *reply = structs.IssuedCert{ 530 SerialNumber: connect.HexString(cert.SerialNumber.Bytes()), 531 CertPEM: pem, 532 Service: serviceID.Service, 533 ServiceURI: cert.URIs[0].String(), 534 ValidAfter: cert.NotBefore, 535 ValidBefore: cert.NotAfter, 536 RaftIndex: structs.RaftIndex{ 537 ModifyIndex: modIdx, 538 CreateIndex: modIdx, 539 }, 540 } 541 542 return nil 543 }