// Source: github.com/manicqin/nomad@v0.9.5/nomad/vault.go

package nomad

import (
	"context"
	"errors"
	"fmt"
	"math/rand"
	"strconv"
	"sync"
	"sync/atomic"
	"time"

	tomb "gopkg.in/tomb.v2"

	metrics "github.com/armon/go-metrics"
	log "github.com/hashicorp/go-hclog"
	multierror "github.com/hashicorp/go-multierror"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/nomad/structs/config"
	vapi "github.com/hashicorp/vault/api"
	"github.com/mitchellh/mapstructure"

	"golang.org/x/sync/errgroup"
	"golang.org/x/time/rate"
)

const (
	// vaultTokenCreateTTL is the duration the wrapped token for the client is
	// valid for. The units are in seconds.
	vaultTokenCreateTTL = "60s"

	// minimumTokenTTL is the minimum Token TTL allowed for child tokens.
	minimumTokenTTL = 5 * time.Minute

	// defaultTokenTTL is the default Token TTL used when the passed token is a
	// root token such that child tokens aren't being created against a role
	// that has defined a TTL.
	defaultTokenTTL = "72h"

	// requestRateLimit is the maximum number of requests per second Nomad will
	// make against Vault.
	requestRateLimit rate.Limit = 500.0

	// maxParallelRevokes is the maximum number of parallel Vault
	// token revocation requests.
	maxParallelRevokes = 64

	// vaultRevocationIntv is the interval at which Vault tokens that failed
	// initial revocation are retried.
	vaultRevocationIntv = 5 * time.Minute

	// vaultCapabilitiesLookupPath is the path to lookup the capabilities of
	// one's token.
	vaultCapabilitiesLookupPath = "sys/capabilities-self"

	// vaultTokenRenewPath is the path used to renew our token.
	vaultTokenRenewPath = "auth/token/renew-self"

	// vaultTokenLookupPath is the path used to lookup a token.
	vaultTokenLookupPath = "auth/token/lookup"

	// vaultTokenRevokePath is the path used to revoke a token.
	vaultTokenRevokePath = "auth/token/revoke-accessor"

	// vaultRoleLookupPath is the path to lookup a role; the %s verb is
	// replaced with the role name.
	vaultRoleLookupPath = "auth/token/roles/%s"

	// vaultTokenRoleCreatePath is the path to create a token from a role; the
	// %s verb is replaced with the role name.
	vaultTokenRoleCreatePath = "auth/token/create/%s"
)

var (
	// vaultCapabilitiesCapability is the expected capability of Nomad's Vault
	// token on the path. The token must have at least one of the
	// capabilities.
	vaultCapabilitiesCapability = []string{"update", "root"}

	// vaultTokenRenewCapability is the expected capability Nomad's
	// Vault token should have on the path. The token must have at least one of
	// the capabilities.
	vaultTokenRenewCapability = []string{"update", "root"}

	// vaultTokenLookupCapability is the expected capability Nomad's
	// Vault token should have on the path. The token must have at least one of
	// the capabilities.
	vaultTokenLookupCapability = []string{"update", "root"}

	// vaultTokenRevokeCapability is the expected capability Nomad's
	// Vault token should have on the path. The token must have at least one of
	// the capabilities.
	vaultTokenRevokeCapability = []string{"update", "root"}

	// vaultRoleLookupCapability is the expected capability Nomad's Vault
	// token should have on the path. The token must have at least one of the
	// capabilities.
	vaultRoleLookupCapability = []string{"read", "root"}

	// vaultTokenRoleCreateCapability is the expected capability Nomad's Vault
	// token should have on the path. The token must have at least one of the
	// capabilities.
	vaultTokenRoleCreateCapability = []string{"update", "root"}
)

// VaultClient is the Servers interface for interfacing with Vault.
type VaultClient interface {
	// SetActive activates or de-activates the Vault client. When active, token
	// creation/lookup/revocation operations are allowed.
	SetActive(active bool)

	// SetConfig updates the config used by the Vault client.
	SetConfig(config *config.VaultConfig) error

	// CreateToken takes an allocation and task and returns an appropriate Vault
	// Secret.
	CreateToken(ctx context.Context, a *structs.Allocation, task string) (*vapi.Secret, error)

	// LookupToken takes a token string and returns its capabilities.
	LookupToken(ctx context.Context, token string) (*vapi.Secret, error)

	// RevokeTokens takes a set of token accessors and revokes the tokens.
	RevokeTokens(ctx context.Context, accessors []*structs.VaultAccessor, committed bool) error

	// Stop is used to stop token renewal.
	Stop()

	// Running returns whether the Vault client is running.
	Running() bool

	// Stats returns the Vault client's statistics.
	Stats() map[string]string

	// EmitStats emits the client's statistics at the given period until stopCh
	// is called.
	EmitStats(period time.Duration, stopCh <-chan struct{})
}

// VaultStats returns all the stats about Vault tokens created and managed by
// Nomad.
type VaultStats struct {
	// TrackedForRevoke is the count of tokens that are being tracked to be
	// revoked since they could not be immediately revoked.
	TrackedForRevoke int

	// TokenTTL is the time-to-live duration for the current token.
	TokenTTL time.Duration

	// TokenExpiry is the recorded expiry time of the current token.
	TokenExpiry time.Time
}

// PurgeVaultAccessorFn is called to remove VaultAccessors from the system.
// If the function returns an error, the token will still be tracked and
// revocation will retry till there is a success.
type PurgeVaultAccessorFn func(accessors []*structs.VaultAccessor) error

// tokenData holds the relevant information about the Vault token passed to the
// client. The tagged fields are decoded from the data block of a token lookup.
type tokenData struct {
	CreationTTL int      `mapstructure:"creation_ttl"`
	TTL         int      `mapstructure:"ttl"`
	Renewable   bool     `mapstructure:"renewable"`
	Policies    []string `mapstructure:"policies"`
	Role        string   `mapstructure:"role"`
	// Root is derived rather than configured: parseSelfToken sets it to true
	// when Policies contains "root".
	Root bool
}

// vaultClient is the Servers implementation of the VaultClient interface. The
// client renews the PeriodicToken given in the Vault configuration and provides
// the Server with the ability to create child tokens and lookup the permissions
// of tokens.
type vaultClient struct {
	// limiter is used to rate limit requests to Vault
	limiter *rate.Limiter

	// client is the Vault API client used for Namespace-relative integrations
	// with the Vault API (anything except `/v1/sys`). If this server is not
	// configured to reference a Vault namespace, this will point to the same
	// client as clientSys
	client *vapi.Client

	// clientSys is the Vault API client used for non-Namespace-relative integrations
	// with the Vault API (anything involving `/v1/sys`). This client is never configured
	// with a Vault namespace, because these endpoints may return errors if a namespace
	// header is provided
	clientSys *vapi.Client

	// auth is the Vault token auth API client
	auth *vapi.TokenAuth

	// config is the user passed Vault config
	config *config.VaultConfig

	// connEstablished marks whether we have an established connection to Vault.
	connEstablished bool

	// connEstablishedErr marks an error that can occur when establishing a
	// connection
	connEstablishedErr error

	// token is the raw token used by the client
	token string

	// tokenData is the data of the passed Vault token
	tokenData *tokenData

	// revoking tracks the VaultAccessors that must be revoked. Access to the
	// map is guarded by revLock.
	revoking map[*structs.VaultAccessor]time.Time
	purgeFn  PurgeVaultAccessorFn
	revLock  sync.Mutex

	// active indicates whether the vaultClient is active. It should be
	// accessed using a helper and updated atomically
	active int32

	// running indicates whether the vault client is started.
	running bool

	// renewLoopActive indicates whether the renewal goroutine is running.
	// It should be accessed and updated atomically.
	// Used for testing purposes only.
	renewLoopActive int32

	// childTTL is the TTL for child tokens.
	childTTL string

	// currentExpiration is the time the current token lease expires. It is
	// guarded by currentExpirationLock.
	currentExpiration     time.Time
	currentExpirationLock sync.Mutex

	// tomb tracks the background goroutines; logger is the named ("vault")
	// logger used by the client.
	tomb   *tomb.Tomb
	logger log.Logger

	// l is used to lock the configuration aspects of the client such that
	// multiple callers can't cause conflicting config updates
	l sync.Mutex

	// setConfigLock serializes access to the SetConfig method
	setConfigLock sync.Mutex
}

// NewVaultClient returns a Vault client from the given config. If the client
// couldn't be made an error is returned.
243 func NewVaultClient(c *config.VaultConfig, logger log.Logger, purgeFn PurgeVaultAccessorFn) (*vaultClient, error) { 244 if c == nil { 245 return nil, fmt.Errorf("must pass valid VaultConfig") 246 } 247 248 if logger == nil { 249 return nil, fmt.Errorf("must pass valid logger") 250 } 251 252 v := &vaultClient{ 253 config: c, 254 logger: logger.Named("vault"), 255 limiter: rate.NewLimiter(requestRateLimit, int(requestRateLimit)), 256 revoking: make(map[*structs.VaultAccessor]time.Time), 257 purgeFn: purgeFn, 258 tomb: &tomb.Tomb{}, 259 } 260 261 if v.config.IsEnabled() { 262 if err := v.buildClient(); err != nil { 263 return nil, err 264 } 265 266 // Launch the required goroutines 267 v.tomb.Go(wrapNilError(v.establishConnection)) 268 v.tomb.Go(wrapNilError(v.revokeDaemon)) 269 270 v.running = true 271 } 272 273 return v, nil 274 } 275 276 func (v *vaultClient) Stop() { 277 v.l.Lock() 278 running := v.running 279 v.running = false 280 v.l.Unlock() 281 282 if running { 283 v.tomb.Kill(nil) 284 v.tomb.Wait() 285 v.flush() 286 } 287 } 288 289 func (v *vaultClient) Running() bool { 290 v.l.Lock() 291 defer v.l.Unlock() 292 return v.running 293 } 294 295 // SetActive activates or de-activates the Vault client. When active, token 296 // creation/lookup/revocation operation are allowed. 
All queued revocations are 297 // cancelled if set un-active as it is assumed another instances is taking over 298 func (v *vaultClient) SetActive(active bool) { 299 if active { 300 atomic.StoreInt32(&v.active, 1) 301 } else { 302 atomic.StoreInt32(&v.active, 0) 303 } 304 305 // Clear out the revoking tokens 306 v.revLock.Lock() 307 v.revoking = make(map[*structs.VaultAccessor]time.Time) 308 v.revLock.Unlock() 309 310 return 311 } 312 313 // flush is used to reset the state of the vault client 314 func (v *vaultClient) flush() { 315 v.l.Lock() 316 defer v.l.Unlock() 317 v.revLock.Lock() 318 defer v.revLock.Unlock() 319 320 v.client = nil 321 v.clientSys = nil 322 v.auth = nil 323 v.connEstablished = false 324 v.connEstablishedErr = nil 325 v.token = "" 326 v.tokenData = nil 327 v.revoking = make(map[*structs.VaultAccessor]time.Time) 328 v.childTTL = "" 329 v.tomb = &tomb.Tomb{} 330 } 331 332 // SetConfig is used to update the Vault config being used. A temporary outage 333 // may occur after calling as it re-establishes a connection to Vault 334 func (v *vaultClient) SetConfig(config *config.VaultConfig) error { 335 if config == nil { 336 return fmt.Errorf("must pass valid VaultConfig") 337 } 338 v.setConfigLock.Lock() 339 defer v.setConfigLock.Unlock() 340 341 v.l.Lock() 342 defer v.l.Unlock() 343 344 // If reloading the same config, no-op 345 if v.config.IsEqual(config) { 346 return nil 347 } 348 349 // Kill any background routines 350 if v.running { 351 // Kill any background routine 352 v.tomb.Kill(nil) 353 354 // Locking around tomb.Wait can deadlock with 355 // establishConnection exiting, so we must unlock here. 
356 v.l.Unlock() 357 v.tomb.Wait() 358 v.l.Lock() 359 360 // Stop accepting any new requests 361 v.connEstablished = false 362 v.tomb = &tomb.Tomb{} 363 v.running = false 364 } 365 366 // Store the new config 367 v.config = config 368 369 // Check if we should relaunch 370 if v.config.IsEnabled() { 371 // Rebuild the client 372 if err := v.buildClient(); err != nil { 373 return err 374 } 375 376 // Launch the required goroutines 377 v.tomb.Go(wrapNilError(v.establishConnection)) 378 v.tomb.Go(wrapNilError(v.revokeDaemon)) 379 v.running = true 380 } 381 382 return nil 383 } 384 385 // buildClient is used to build a Vault client based on the stored Vault config 386 func (v *vaultClient) buildClient() error { 387 // Validate we have the required fields. 388 if v.config.Token == "" { 389 return errors.New("Vault token must be set") 390 } else if v.config.Addr == "" { 391 return errors.New("Vault address must be set") 392 } 393 394 // Parse the TTL if it is set 395 if v.config.TaskTokenTTL != "" { 396 d, err := time.ParseDuration(v.config.TaskTokenTTL) 397 if err != nil { 398 return fmt.Errorf("failed to parse TaskTokenTTL %q: %v", v.config.TaskTokenTTL, err) 399 } 400 401 if d.Nanoseconds() < minimumTokenTTL.Nanoseconds() { 402 return fmt.Errorf("ChildTokenTTL is less than minimum allowed of %v", minimumTokenTTL) 403 } 404 405 v.childTTL = v.config.TaskTokenTTL 406 } else { 407 // Default the TaskTokenTTL 408 v.childTTL = defaultTokenTTL 409 } 410 411 // Get the Vault API configuration 412 apiConf, err := v.config.ApiConfig() 413 if err != nil { 414 return fmt.Errorf("Failed to create Vault API config: %v", err) 415 } 416 417 // Create the Vault API client 418 client, err := vapi.NewClient(apiConf) 419 if err != nil { 420 v.logger.Error("failed to create Vault client and not retrying", "error", err) 421 return err 422 } 423 424 // Store the client, create/assign the /sys client 425 v.client = client 426 if v.config.Namespace != "" { 427 v.logger.Debug("configuring 
Vault namespace", "namespace", v.config.Namespace) 428 v.clientSys, err = vapi.NewClient(apiConf) 429 if err != nil { 430 v.logger.Error("failed to create Vault sys client and not retrying", "error", err) 431 return err 432 } 433 client.SetNamespace(v.config.Namespace) 434 } else { 435 v.clientSys = client 436 } 437 438 // Set the token 439 v.token = v.config.Token 440 client.SetToken(v.token) 441 v.auth = client.Auth().Token() 442 443 return nil 444 } 445 446 // establishConnection is used to make first contact with Vault. This should be 447 // called in a go-routine since the connection is retried until the Vault Client 448 // is stopped or the connection is successfully made at which point the renew 449 // loop is started. 450 func (v *vaultClient) establishConnection() { 451 // Create the retry timer and set initial duration to zero so it fires 452 // immediately 453 retryTimer := time.NewTimer(0) 454 initStatus := false 455 OUTER: 456 for { 457 select { 458 case <-v.tomb.Dying(): 459 return 460 case <-retryTimer.C: 461 // Ensure the API is reachable 462 if !initStatus { 463 if _, err := v.clientSys.Sys().InitStatus(); err != nil { 464 v.logger.Warn("failed to contact Vault API", "retry", v.config.ConnectionRetryIntv, "error", err) 465 retryTimer.Reset(v.config.ConnectionRetryIntv) 466 continue OUTER 467 } 468 initStatus = true 469 } 470 // Retry validating the token till success 471 if err := v.parseSelfToken(); err != nil { 472 v.logger.Error("failed to validate self token/role", "retry", v.config.ConnectionRetryIntv, "error", err) 473 retryTimer.Reset(v.config.ConnectionRetryIntv) 474 v.l.Lock() 475 v.connEstablished = true 476 v.connEstablishedErr = fmt.Errorf("failed to establish connection to Vault: %v", err) 477 v.l.Unlock() 478 continue OUTER 479 } 480 break OUTER 481 } 482 } 483 484 // Set the wrapping function such that token creation is wrapped now 485 // that we know our role 486 v.client.SetWrappingLookupFunc(v.getWrappingFn()) 487 488 // If we are 
given a non-root token, start renewing it 489 if v.tokenData.Root && v.tokenData.CreationTTL == 0 { 490 v.logger.Debug("not renewing token as it is root") 491 } else { 492 v.logger.Debug("starting renewal loop", "creation_ttl", time.Duration(v.tokenData.CreationTTL)*time.Second) 493 v.tomb.Go(wrapNilError(v.renewalLoop)) 494 } 495 496 v.l.Lock() 497 v.connEstablished = true 498 v.connEstablishedErr = nil 499 v.l.Unlock() 500 } 501 502 func (v *vaultClient) isRenewLoopActive() bool { 503 return atomic.LoadInt32(&v.renewLoopActive) == 1 504 } 505 506 // renewalLoop runs the renew loop. This should only be called if we are given a 507 // non-root token. 508 func (v *vaultClient) renewalLoop() { 509 atomic.StoreInt32(&v.renewLoopActive, 1) 510 defer atomic.StoreInt32(&v.renewLoopActive, 0) 511 512 // Create the renewal timer and set initial duration to zero so it fires 513 // immediately 514 authRenewTimer := time.NewTimer(0) 515 516 // Backoff is to reduce the rate we try to renew with Vault under error 517 // situations 518 backoff := 0.0 519 520 for { 521 select { 522 case <-v.tomb.Dying(): 523 return 524 case <-authRenewTimer.C: 525 // Renew the token and determine the new expiration 526 recoverable, err := v.renew() 527 v.currentExpirationLock.Lock() 528 currentExpiration := v.currentExpiration 529 v.currentExpirationLock.Unlock() 530 531 // Successfully renewed 532 if err == nil { 533 // Attempt to renew the token at half the expiration time 534 durationUntilRenew := currentExpiration.Sub(time.Now()) / 2 535 536 v.logger.Info("successfully renewed token", "next_renewal", durationUntilRenew) 537 authRenewTimer.Reset(durationUntilRenew) 538 539 // Reset any backoff 540 backoff = 0 541 break 542 } 543 544 metrics.IncrCounter([]string{"nomad", "vault", "renew_failed"}, 1) 545 v.logger.Warn("got error or bad auth, so backing off", "error", err, "recoverable", recoverable) 546 547 if !recoverable { 548 return 549 } 550 551 backoff = nextBackoff(backoff, 
currentExpiration) 552 if backoff < 0 { 553 // We have failed to renew the token past its expiration. Stop 554 // renewing with Vault. 555 v.logger.Error("failed to renew Vault token before lease expiration. Shutting down Vault client", 556 "error", err) 557 v.l.Lock() 558 v.connEstablished = false 559 v.connEstablishedErr = err 560 v.l.Unlock() 561 return 562 } 563 564 durationUntilRetry := time.Duration(backoff) * time.Second 565 v.logger.Info("backing off renewal", "retry", durationUntilRetry) 566 567 authRenewTimer.Reset(durationUntilRetry) 568 } 569 } 570 } 571 572 // nextBackoff returns the delay for the next auto renew interval, in seconds. 573 // Returns negative value if past expiration 574 // 575 // It should increase the amount of backoff each time, with the following rules: 576 // 577 // * If token expired already despite earlier renewal attempts, 578 // back off for 1 minute + jitter 579 // * If we have an existing authentication that is going to expire, 580 // never back off more than half of the amount of time remaining 581 // until expiration (with 5s floor) 582 // * Never back off more than 30 seconds multiplied by a random 583 // value between 1 and 2 584 // * Use randomness so that many clients won't keep hitting Vault 585 // at the same time 586 func nextBackoff(backoff float64, expiry time.Time) float64 { 587 maxBackoff := time.Until(expiry) / 2 588 589 if maxBackoff < 0 { 590 // expiry passed 591 return 60 * (1.0 + rand.Float64()) 592 } 593 594 switch { 595 case backoff >= 24: 596 backoff = 30 597 default: 598 backoff = backoff * 1.25 599 } 600 601 // Add randomness 602 backoff = backoff * (1.0 + rand.Float64()) 603 604 if backoff > maxBackoff.Seconds() { 605 backoff = maxBackoff.Seconds() 606 } 607 608 if backoff < 5 { 609 backoff = 5 610 } 611 612 return backoff 613 } 614 615 // renew attempts to renew our Vault token. If the renewal fails, an error is 616 // returned. The boolean indicates whether it's safe to attempt to renew again. 
617 // This method updates the currentExpiration time 618 func (v *vaultClient) renew() (bool, error) { 619 // Track how long the request takes 620 defer metrics.MeasureSince([]string{"nomad", "vault", "renew"}, time.Now()) 621 622 // Attempt to renew the token 623 secret, err := v.auth.RenewSelf(v.tokenData.CreationTTL) 624 if err != nil { 625 // Check if there is a permission denied 626 recoverable := !structs.VaultUnrecoverableError.MatchString(err.Error()) 627 return recoverable, fmt.Errorf("failed to renew the vault token: %v", err) 628 } 629 630 if secret == nil { 631 // It's possible for RenewSelf to return (nil, nil) if the 632 // response body from Vault is empty. 633 return true, fmt.Errorf("renewal failed: empty response from vault") 634 } 635 636 // these treated as transient errors, where can keep renewing 637 auth := secret.Auth 638 if auth == nil { 639 return true, fmt.Errorf("renewal successful but not auth information returned") 640 } else if auth.LeaseDuration == 0 { 641 return true, fmt.Errorf("renewal successful but no lease duration returned") 642 } 643 644 v.extendExpiration(auth.LeaseDuration) 645 646 v.logger.Debug("successfully renewed server token") 647 return true, nil 648 } 649 650 // getWrappingFn returns an appropriate wrapping function for Nomad Servers 651 func (v *vaultClient) getWrappingFn() func(operation, path string) string { 652 createPath := "auth/token/create" 653 role := v.getRole() 654 if role != "" { 655 createPath = fmt.Sprintf("auth/token/create/%s", role) 656 } 657 658 return func(operation, path string) string { 659 // Only wrap the token create operation 660 if operation != "POST" || path != createPath { 661 return "" 662 } 663 664 return vaultTokenCreateTTL 665 } 666 } 667 668 // parseSelfToken looks up the Vault token in Vault and parses its data storing 669 // it in the client. If the token is not valid for Nomads purposes an error is 670 // returned. 
671 func (v *vaultClient) parseSelfToken() error { 672 // Try looking up the token using the self endpoint 673 secret, err := v.lookupSelf() 674 if err != nil { 675 return err 676 } 677 678 // Read and parse the fields 679 var data tokenData 680 if err := mapstructure.WeakDecode(secret.Data, &data); err != nil { 681 return fmt.Errorf("failed to parse Vault token's data block: %v", err) 682 } 683 root := false 684 for _, p := range data.Policies { 685 if p == "root" { 686 root = true 687 break 688 } 689 } 690 data.Root = root 691 v.tokenData = &data 692 v.extendExpiration(data.TTL) 693 694 // The criteria that must be met for the token to be valid are as follows: 695 // 1) If token is non-root or is but has a creation ttl 696 // a) The token must be renewable 697 // b) Token must have a non-zero TTL 698 // 2) Must have update capability for "auth/token/lookup/" (used to verify incoming tokens) 699 // 3) Must have update capability for "/auth/token/revoke-accessor/" (used to revoke unneeded tokens) 700 // 4) If configured to create tokens against a role: 701 // a) Must have read capability for "auth/token/roles/<role_name" (Can just attempt a read) 702 // b) Must have update capability for path "auth/token/create/<role_name>" 703 // c) Role must: 704 // 1) Must allow tokens to be renewed 705 // 2) Must not have an explicit max TTL 706 // 3) Must have non-zero period 707 // 5) If not configured against a role, the token must be root 708 709 var mErr multierror.Error 710 role := v.getRole() 711 if !data.Root { 712 // All non-root tokens must be renewable 713 if !data.Renewable { 714 multierror.Append(&mErr, fmt.Errorf("Vault token is not renewable or root")) 715 } 716 717 // All non-root tokens must have a lease duration 718 if data.CreationTTL == 0 { 719 multierror.Append(&mErr, fmt.Errorf("invalid lease duration of zero")) 720 } 721 722 // The lease duration can not be expired 723 if data.TTL == 0 { 724 multierror.Append(&mErr, fmt.Errorf("token TTL is zero")) 725 } 
726 727 // There must be a valid role since we aren't root 728 if role == "" { 729 multierror.Append(&mErr, fmt.Errorf("token role name must be set when not using a root token")) 730 } 731 732 } else if data.CreationTTL != 0 { 733 // If the root token has a TTL it must be renewable 734 if !data.Renewable { 735 multierror.Append(&mErr, fmt.Errorf("Vault token has a TTL but is not renewable")) 736 } else if data.TTL == 0 { 737 // If the token has a TTL make sure it has not expired 738 multierror.Append(&mErr, fmt.Errorf("token TTL is zero")) 739 } 740 } 741 742 // Check we have the correct capabilities 743 if err := v.validateCapabilities(role, data.Root); err != nil { 744 multierror.Append(&mErr, err) 745 } 746 747 // If given a role validate it 748 if role != "" { 749 if err := v.validateRole(role); err != nil { 750 multierror.Append(&mErr, err) 751 } 752 } 753 754 return mErr.ErrorOrNil() 755 } 756 757 // lookupSelf is a helper function that looks up latest self lease info. 758 func (v *vaultClient) lookupSelf() (*vapi.Secret, error) { 759 // Get the initial lease duration 760 auth := v.client.Auth().Token() 761 762 secret, err := auth.LookupSelf() 763 if err == nil && secret != nil && secret.Data != nil { 764 return secret, nil 765 } 766 767 // Try looking up our token directly, even when we get an empty response, 768 // in case of an unexpected event - a true failure would occur in this lookup again 769 secret, err = auth.Lookup(v.client.Token()) 770 switch { 771 case err != nil: 772 return nil, fmt.Errorf("failed to lookup Vault periodic token: %v", err) 773 case secret == nil || secret.Data == nil: 774 return nil, fmt.Errorf("failed to lookup Vault periodic token: got empty response") 775 default: 776 return secret, nil 777 } 778 } 779 780 // getRole returns the role name to be used when creating tokens 781 func (v *vaultClient) getRole() string { 782 if v.config.Role != "" { 783 return v.config.Role 784 } 785 786 return v.tokenData.Role 787 } 788 789 // 
validateCapabilities checks that Nomad's Vault token has the correct 790 // capabilities. 791 func (v *vaultClient) validateCapabilities(role string, root bool) error { 792 // Check if the token can lookup capabilities. 793 var mErr multierror.Error 794 _, _, err := v.hasCapability(vaultCapabilitiesLookupPath, vaultCapabilitiesCapability) 795 if err != nil { 796 // Check if there is a permission denied 797 if structs.VaultUnrecoverableError.MatchString(err.Error()) { 798 // Since we can't read permissions, we just log a warning that we 799 // can't tell if the Vault token will work 800 msg := fmt.Sprintf("can not lookup token capabilities. "+ 801 "As such certain operations may fail in the future. "+ 802 "Please give Nomad a Vault token with one of the following "+ 803 "capabilities %q on %q so that the required capabilities can be verified", 804 vaultCapabilitiesCapability, vaultCapabilitiesLookupPath) 805 v.logger.Warn(msg) 806 return nil 807 } else { 808 multierror.Append(&mErr, err) 809 } 810 } 811 812 // verify is a helper function that verifies the token has one of the 813 // capabilities on the given path and adds an issue to the error 814 verify := func(path string, requiredCaps []string) { 815 ok, caps, err := v.hasCapability(path, requiredCaps) 816 if err != nil { 817 multierror.Append(&mErr, err) 818 } else if !ok { 819 multierror.Append(&mErr, 820 fmt.Errorf("token must have one of the following capabilities %q on %q; has %v", requiredCaps, path, caps)) 821 } 822 } 823 824 // Check if we are verifying incoming tokens 825 if !v.config.AllowsUnauthenticated() { 826 verify(vaultTokenLookupPath, vaultTokenLookupCapability) 827 } 828 829 // Verify we can renew our selves tokens 830 verify(vaultTokenRenewPath, vaultTokenRenewCapability) 831 832 // Verify we can revoke tokens 833 verify(vaultTokenRevokePath, vaultTokenRevokeCapability) 834 835 // If we are using a role verify the capability 836 if role != "" { 837 // Verify we can read the role 838 
verify(fmt.Sprintf(vaultRoleLookupPath, role), vaultRoleLookupCapability) 839 840 // Verify we can create from the role 841 verify(fmt.Sprintf(vaultTokenRoleCreatePath, role), vaultTokenRoleCreateCapability) 842 } 843 844 return mErr.ErrorOrNil() 845 } 846 847 // hasCapability takes a path and returns whether the token has at least one of 848 // the required capabilities on the given path. It also returns the set of 849 // capabilities the token does have as well as any error that occurred. 850 func (v *vaultClient) hasCapability(path string, required []string) (bool, []string, error) { 851 caps, err := v.client.Sys().CapabilitiesSelf(path) 852 if err != nil { 853 return false, nil, err 854 } 855 for _, c := range caps { 856 for _, r := range required { 857 if c == r { 858 return true, caps, nil 859 } 860 } 861 } 862 return false, caps, nil 863 } 864 865 // validateRole contacts Vault and checks that the given Vault role is valid for 866 // the purposes of being used by Nomad 867 func (v *vaultClient) validateRole(role string) error { 868 if role == "" { 869 return fmt.Errorf("Invalid empty role name") 870 } 871 872 // Validate the role 873 rsecret, err := v.client.Logical().Read(fmt.Sprintf("auth/token/roles/%s", role)) 874 if err != nil { 875 return fmt.Errorf("failed to lookup role %q: %v", role, err) 876 } 877 if rsecret == nil { 878 return fmt.Errorf("Role %q does not exist", role) 879 } 880 881 // Read and parse the fields 882 var data struct { 883 ExplicitMaxTtl int `mapstructure:"explicit_max_ttl"` 884 TokenExplicitMaxTtl int `mapstructure:"token_explicit_max_ttl"` 885 Orphan bool 886 Period int 887 TokenPeriod int `mapstructure:"token_period"` 888 Renewable bool 889 } 890 if err := mapstructure.WeakDecode(rsecret.Data, &data); err != nil { 891 return fmt.Errorf("failed to parse Vault role's data block: %v", err) 892 } 893 894 // Validate the role is acceptable 895 var mErr multierror.Error 896 if !data.Renewable { 897 multierror.Append(&mErr, 
fmt.Errorf("Role must allow tokens to be renewed")) 898 } 899 900 if data.ExplicitMaxTtl != 0 || data.TokenExplicitMaxTtl != 0 { 901 multierror.Append(&mErr, fmt.Errorf("Role can not use an explicit max ttl. Token must be periodic.")) 902 } 903 904 if data.Period == 0 && data.TokenPeriod == 0 { 905 multierror.Append(&mErr, fmt.Errorf("Role must have a non-zero period to make tokens periodic.")) 906 } 907 908 return mErr.ErrorOrNil() 909 } 910 911 // ConnectionEstablished returns whether a connection to Vault has been 912 // established and any error that potentially caused it to be false 913 func (v *vaultClient) ConnectionEstablished() (bool, error) { 914 v.l.Lock() 915 defer v.l.Unlock() 916 return v.connEstablished, v.connEstablishedErr 917 } 918 919 // Enabled returns whether the client is active 920 func (v *vaultClient) Enabled() bool { 921 v.l.Lock() 922 defer v.l.Unlock() 923 return v.config.IsEnabled() 924 } 925 926 // Active returns whether the client is active 927 func (v *vaultClient) Active() bool { 928 return atomic.LoadInt32(&v.active) == 1 929 } 930 931 // CreateToken takes the allocation and task and returns an appropriate Vault 932 // token. The call is rate limited and may be canceled with the passed policy. 
933 // When the error is recoverable, it will be of type RecoverableError 934 func (v *vaultClient) CreateToken(ctx context.Context, a *structs.Allocation, task string) (*vapi.Secret, error) { 935 if !v.Enabled() { 936 return nil, fmt.Errorf("Vault integration disabled") 937 } 938 if !v.Active() { 939 return nil, structs.NewRecoverableError(fmt.Errorf("Vault client not active"), true) 940 } 941 942 // Check if we have established a connection with Vault 943 if established, err := v.ConnectionEstablished(); !established && err == nil { 944 return nil, structs.NewRecoverableError(fmt.Errorf("Connection to Vault has not been established"), true) 945 } else if err != nil { 946 return nil, err 947 } 948 949 // Track how long the request takes 950 defer metrics.MeasureSince([]string{"nomad", "vault", "create_token"}, time.Now()) 951 952 // Retrieve the Vault block for the task 953 policies := a.Job.VaultPolicies() 954 if policies == nil { 955 return nil, fmt.Errorf("Job doesn't require Vault policies") 956 } 957 tg, ok := policies[a.TaskGroup] 958 if !ok { 959 return nil, fmt.Errorf("Task group does not require Vault policies") 960 } 961 taskVault, ok := tg[task] 962 if !ok { 963 return nil, fmt.Errorf("Task does not require Vault policies") 964 } 965 966 // Build the creation request 967 req := &vapi.TokenCreateRequest{ 968 Policies: taskVault.Policies, 969 Metadata: map[string]string{ 970 "AllocationID": a.ID, 971 "Task": task, 972 "NodeID": a.NodeID, 973 }, 974 TTL: v.childTTL, 975 DisplayName: fmt.Sprintf("%s-%s", a.ID, task), 976 } 977 978 // Ensure we are under our rate limit 979 if err := v.limiter.Wait(ctx); err != nil { 980 return nil, err 981 } 982 983 // Make the request and switch depending on whether we are using a root 984 // token or a role based token 985 var secret *vapi.Secret 986 var err error 987 role := v.getRole() 988 if v.tokenData.Root && role == "" { 989 req.Period = v.childTTL 990 secret, err = v.auth.Create(req) 991 } else { 992 // Make the 
token using the role 993 secret, err = v.auth.CreateWithRole(req, v.getRole()) 994 } 995 996 // Determine whether it is unrecoverable 997 if err != nil { 998 err = fmt.Errorf("failed to create an alloc vault token: %v", err) 999 if structs.VaultUnrecoverableError.MatchString(err.Error()) { 1000 return secret, err 1001 } 1002 1003 // The error is recoverable 1004 return nil, structs.NewRecoverableError(err, true) 1005 } 1006 1007 // Validate the response 1008 var validationErr error 1009 if secret == nil { 1010 validationErr = fmt.Errorf("Vault returned nil Secret") 1011 } else if secret.WrapInfo == nil { 1012 validationErr = fmt.Errorf("Vault returned Secret with nil WrapInfo. Secret warnings: %v", secret.Warnings) 1013 } else if secret.WrapInfo.WrappedAccessor == "" { 1014 validationErr = fmt.Errorf("Vault returned WrapInfo without WrappedAccessor. Secret warnings: %v", secret.Warnings) 1015 } 1016 if validationErr != nil { 1017 v.logger.Warn("failed to CreateToken", "error", validationErr) 1018 return nil, structs.NewRecoverableError(validationErr, true) 1019 } 1020 1021 // Got a valid response 1022 return secret, nil 1023 } 1024 1025 // LookupToken takes a Vault token and does a lookup against Vault. The call is 1026 // rate limited and may be canceled with passed context. 
1027 func (v *vaultClient) LookupToken(ctx context.Context, token string) (*vapi.Secret, error) { 1028 if !v.Enabled() { 1029 return nil, fmt.Errorf("Vault integration disabled") 1030 } 1031 1032 if !v.Active() { 1033 return nil, fmt.Errorf("Vault client not active") 1034 } 1035 1036 // Check if we have established a connection with Vault 1037 if established, err := v.ConnectionEstablished(); !established && err == nil { 1038 return nil, structs.NewRecoverableError(fmt.Errorf("Connection to Vault has not been established"), true) 1039 } else if err != nil { 1040 return nil, err 1041 } 1042 1043 // Track how long the request takes 1044 defer metrics.MeasureSince([]string{"nomad", "vault", "lookup_token"}, time.Now()) 1045 1046 // Ensure we are under our rate limit 1047 if err := v.limiter.Wait(ctx); err != nil { 1048 return nil, err 1049 } 1050 1051 // Lookup the token 1052 return v.auth.Lookup(token) 1053 } 1054 1055 // PoliciesFrom parses the set of policies returned by a token lookup. 1056 func PoliciesFrom(s *vapi.Secret) ([]string, error) { 1057 if s == nil { 1058 return nil, fmt.Errorf("cannot parse nil Vault secret") 1059 } 1060 var data tokenData 1061 if err := mapstructure.WeakDecode(s.Data, &data); err != nil { 1062 return nil, fmt.Errorf("failed to parse Vault token's data block: %v", err) 1063 } 1064 1065 return data.Policies, nil 1066 } 1067 1068 // RevokeTokens revokes the passed set of accessors. If committed is set, the 1069 // purge function passed to the client is called. If there is an error purging 1070 // either because of Vault failures or because of the purge function, the 1071 // revocation is retried until the tokens TTL. 
1072 func (v *vaultClient) RevokeTokens(ctx context.Context, accessors []*structs.VaultAccessor, committed bool) error { 1073 if !v.Enabled() { 1074 return nil 1075 } 1076 1077 if !v.Active() { 1078 return fmt.Errorf("Vault client not active") 1079 } 1080 1081 // Track how long the request takes 1082 defer metrics.MeasureSince([]string{"nomad", "vault", "revoke_tokens"}, time.Now()) 1083 1084 // Check if we have established a connection with Vault. If not just add it 1085 // to the queue 1086 if established, err := v.ConnectionEstablished(); !established && err == nil { 1087 // Only bother tracking it for later revocation if the accessor was 1088 // committed 1089 if committed { 1090 v.storeForRevocation(accessors) 1091 } 1092 1093 // Track that we are abandoning these accessors. 1094 metrics.IncrCounter([]string{"nomad", "vault", "undistributed_tokens_abandoned"}, float32(len(accessors))) 1095 return nil 1096 } 1097 1098 // Attempt to revoke immediately and if it fails, add it to the revoke queue 1099 err := v.parallelRevoke(ctx, accessors) 1100 if err != nil { 1101 // If it is uncommitted, it is a best effort revoke as it will shortly 1102 // TTL within the cubbyhole and has not been leaked to any outside 1103 // system 1104 if !committed { 1105 metrics.IncrCounter([]string{"nomad", "vault", "undistributed_tokens_abandoned"}, float32(len(accessors))) 1106 return nil 1107 } 1108 1109 v.logger.Warn("failed to revoke tokens. 
Will reattempt until TTL", "error", err) 1110 v.storeForRevocation(accessors) 1111 return nil 1112 } else if !committed { 1113 // Mark that it was revoked but there is nothing to purge so exit 1114 metrics.IncrCounter([]string{"nomad", "vault", "undistributed_tokens_revoked"}, float32(len(accessors))) 1115 return nil 1116 } 1117 1118 if err := v.purgeFn(accessors); err != nil { 1119 v.logger.Error("failed to purge Vault accessors", "error", err) 1120 v.storeForRevocation(accessors) 1121 return nil 1122 } 1123 1124 // Track that it was revoked successfully 1125 metrics.IncrCounter([]string{"nomad", "vault", "distributed_tokens_revoked"}, float32(len(accessors))) 1126 1127 return nil 1128 } 1129 1130 // storeForRevocation stores the passed set of accessors for revocation. It 1131 // captures their effective TTL by storing their create TTL plus the current 1132 // time. 1133 func (v *vaultClient) storeForRevocation(accessors []*structs.VaultAccessor) { 1134 v.revLock.Lock() 1135 1136 now := time.Now() 1137 for _, a := range accessors { 1138 v.revoking[a] = now.Add(time.Duration(a.CreationTTL) * time.Second) 1139 } 1140 v.revLock.Unlock() 1141 } 1142 1143 // parallelRevoke revokes the passed VaultAccessors in parallel. 
1144 func (v *vaultClient) parallelRevoke(ctx context.Context, accessors []*structs.VaultAccessor) error { 1145 if !v.Enabled() { 1146 return fmt.Errorf("Vault integration disabled") 1147 } 1148 1149 if !v.Active() { 1150 return fmt.Errorf("Vault client not active") 1151 } 1152 1153 // Check if we have established a connection with Vault 1154 if established, err := v.ConnectionEstablished(); !established && err == nil { 1155 return structs.NewRecoverableError(fmt.Errorf("Connection to Vault has not been established"), true) 1156 } else if err != nil { 1157 return err 1158 } 1159 1160 g, pCtx := errgroup.WithContext(ctx) 1161 1162 // Cap the handlers 1163 handlers := len(accessors) 1164 if handlers > maxParallelRevokes { 1165 handlers = maxParallelRevokes 1166 } 1167 1168 // Create the Vault Tokens 1169 input := make(chan *structs.VaultAccessor, handlers) 1170 for i := 0; i < handlers; i++ { 1171 g.Go(func() error { 1172 for { 1173 select { 1174 case va, ok := <-input: 1175 if !ok { 1176 return nil 1177 } 1178 1179 if err := v.auth.RevokeAccessor(va.Accessor); err != nil { 1180 return fmt.Errorf("failed to revoke token (alloc: %q, node: %q, task: %q): %v", va.AllocID, va.NodeID, va.Task, err) 1181 } 1182 case <-pCtx.Done(): 1183 return nil 1184 } 1185 } 1186 }) 1187 } 1188 1189 // Send the input 1190 go func() { 1191 defer close(input) 1192 for _, va := range accessors { 1193 select { 1194 case <-pCtx.Done(): 1195 return 1196 case input <- va: 1197 } 1198 } 1199 1200 }() 1201 1202 // Wait for everything to complete 1203 return g.Wait() 1204 } 1205 1206 // revokeDaemon should be called in a goroutine and is used to periodically 1207 // revoke Vault accessors that failed the original revocation 1208 func (v *vaultClient) revokeDaemon() { 1209 ticker := time.NewTicker(vaultRevocationIntv) 1210 defer ticker.Stop() 1211 1212 for { 1213 select { 1214 case <-v.tomb.Dying(): 1215 return 1216 case now := <-ticker.C: 1217 if established, _ := v.ConnectionEstablished(); 
!established { 1218 continue 1219 } 1220 1221 v.revLock.Lock() 1222 1223 // Fast path 1224 if len(v.revoking) == 0 { 1225 v.revLock.Unlock() 1226 continue 1227 } 1228 1229 // Build the list of allocations that need to revoked while pruning any TTL'd checks 1230 revoking := make([]*structs.VaultAccessor, 0, len(v.revoking)) 1231 for va, ttl := range v.revoking { 1232 if now.After(ttl) { 1233 delete(v.revoking, va) 1234 } else { 1235 revoking = append(revoking, va) 1236 } 1237 } 1238 1239 if err := v.parallelRevoke(context.Background(), revoking); err != nil { 1240 v.logger.Warn("background token revocation errored", "error", err) 1241 v.revLock.Unlock() 1242 continue 1243 } 1244 1245 // Unlock before a potentially expensive operation 1246 v.revLock.Unlock() 1247 1248 // Call the passed in token revocation function 1249 if err := v.purgeFn(revoking); err != nil { 1250 // Can continue since revocation is idempotent 1251 v.logger.Error("token revocation errored", "error", err) 1252 continue 1253 } 1254 1255 // Track that tokens were revoked successfully 1256 metrics.IncrCounter([]string{"nomad", "vault", "distributed_tokens_revoked"}, float32(len(revoking))) 1257 1258 // Can delete from the tracked list now that we have purged 1259 v.revLock.Lock() 1260 for _, va := range revoking { 1261 delete(v.revoking, va) 1262 } 1263 v.revLock.Unlock() 1264 1265 } 1266 } 1267 } 1268 1269 // purgeVaultAccessors creates a Raft transaction to remove the passed Vault 1270 // Accessors 1271 func (s *Server) purgeVaultAccessors(accessors []*structs.VaultAccessor) error { 1272 // Commit this update via Raft 1273 req := structs.VaultAccessorsRequest{Accessors: accessors} 1274 _, _, err := s.raftApply(structs.VaultAccessorDeregisterRequestType, req) 1275 return err 1276 } 1277 1278 // wrapNilError is a helper that returns a wrapped function that returns a nil 1279 // error 1280 func wrapNilError(f func()) func() error { 1281 return func() error { 1282 f() 1283 return nil 1284 } 1285 } 1286 
1287 // setLimit is used to update the rate limit 1288 func (v *vaultClient) setLimit(l rate.Limit) { 1289 v.l.Lock() 1290 defer v.l.Unlock() 1291 v.limiter = rate.NewLimiter(l, int(l)) 1292 } 1293 1294 func (v *vaultClient) Stats() map[string]string { 1295 stat := v.stats() 1296 1297 expireTimeStr := "" 1298 1299 if !stat.TokenExpiry.IsZero() { 1300 expireTimeStr = stat.TokenExpiry.Format(time.RFC3339) 1301 } 1302 1303 return map[string]string{ 1304 "tracked_for_revoked": strconv.Itoa(stat.TrackedForRevoke), 1305 "token_ttl": stat.TokenTTL.Round(time.Second).String(), 1306 "token_expire_time": expireTimeStr, 1307 } 1308 } 1309 1310 func (v *vaultClient) stats() *VaultStats { 1311 // Allocate a new stats struct 1312 stats := new(VaultStats) 1313 1314 v.revLock.Lock() 1315 stats.TrackedForRevoke = len(v.revoking) 1316 v.revLock.Unlock() 1317 1318 v.currentExpirationLock.Lock() 1319 stats.TokenExpiry = v.currentExpiration 1320 v.currentExpirationLock.Unlock() 1321 1322 if !stats.TokenExpiry.IsZero() { 1323 stats.TokenTTL = time.Until(stats.TokenExpiry) 1324 } 1325 1326 return stats 1327 } 1328 1329 // EmitStats is used to export metrics about the blocked eval tracker while enabled 1330 func (v *vaultClient) EmitStats(period time.Duration, stopCh <-chan struct{}) { 1331 for { 1332 select { 1333 case <-time.After(period): 1334 stats := v.stats() 1335 metrics.SetGauge([]string{"nomad", "vault", "distributed_tokens_revoking"}, float32(stats.TrackedForRevoke)) 1336 metrics.SetGauge([]string{"nomad", "vault", "token_ttl"}, float32(stats.TokenTTL/time.Millisecond)) 1337 1338 case <-stopCh: 1339 return 1340 } 1341 } 1342 } 1343 1344 // extendExpiration sets the current auth token expiration record to ttLSeconds seconds from now 1345 func (v *vaultClient) extendExpiration(ttlSeconds int) { 1346 v.currentExpirationLock.Lock() 1347 v.currentExpiration = time.Now().Add(time.Duration(ttlSeconds) * time.Second) 1348 v.currentExpirationLock.Unlock() 1349 }