// github.com/taylorchu/nomad@v0.5.3-rc1.0.20170407200202-db11e7dd7b55/nomad/vault.go

package nomad

import (
	"context"
	"errors"
	"fmt"
	"log"
	"math/rand"
	"regexp"
	"sync"
	"sync/atomic"
	"time"

	"gopkg.in/tomb.v2"

	metrics "github.com/armon/go-metrics"
	multierror "github.com/hashicorp/go-multierror"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/nomad/structs/config"
	vapi "github.com/hashicorp/vault/api"
	"github.com/mitchellh/mapstructure"

	"golang.org/x/sync/errgroup"
	"golang.org/x/time/rate"
)

const (
	// vaultTokenCreateTTL is the duration the wrapped token for the client is
	// valid for. The units are in seconds.
	vaultTokenCreateTTL = "60s"

	// minimumTokenTTL is the minimum Token TTL allowed for child tokens.
	minimumTokenTTL = 5 * time.Minute

	// defaultTokenTTL is the default Token TTL used when the passed token is a
	// root token such that child tokens aren't being created against a role
	// that has defined a TTL
	defaultTokenTTL = "72h"

	// requestRateLimit is the maximum number of requests per second Nomad will
	// make against Vault. NewVaultClient also uses it as the limiter's burst
	// size.
	requestRateLimit rate.Limit = 500.0

	// maxParallelRevokes is the maximum number of parallel Vault
	// token revocation requests
	maxParallelRevokes = 64

	// vaultRevocationIntv is the interval at which Vault tokens that failed
	// initial revocation are retried
	vaultRevocationIntv = 5 * time.Minute

	// vaultCapabilitiesLookupPath is the path to lookup the capabilities of
	// one's token.
	vaultCapabilitiesLookupPath = "sys/capabilities-self"

	// vaultTokenRenewPath is the path used to renew our token
	vaultTokenRenewPath = "auth/token/renew-self"

	// vaultTokenLookupPath is the path used to lookup a token
	vaultTokenLookupPath = "auth/token/lookup"

	// vaultTokenLookupSelfPath is the path used to lookup self token
	vaultTokenLookupSelfPath = "auth/token/lookup-self"

	// vaultTokenRevokePath is the path used to revoke a token
	vaultTokenRevokePath = "auth/token/revoke-accessor"

	// vaultRoleLookupPath is the path to lookup a role. It is a format string
	// that takes the role name.
	vaultRoleLookupPath = "auth/token/roles/%s"

	// vaultTokenRoleCreatePath is the path to create a token from a role. It
	// is a format string that takes the role name.
	vaultTokenRoleCreatePath = "auth/token/create/%s"
)

var (
	// vaultUnrecoverableError matches unrecoverable errors: error text
	// containing "Code:" followed by whitespace and 400, 403 or 404.
	vaultUnrecoverableError = regexp.MustCompile(`Code:\s+40(0|3|4)`)

	// vaultCapabilitiesCapability is the expected capability of Nomad's Vault
	// token on the path. The token must have at least one of the
	// capabilities.
	vaultCapabilitiesCapability = []string{"update", "root"}

	// vaultTokenRenewCapability is the expected capability Nomad's
	// Vault token should have on the path. The token must have at least one of
	// the capabilities.
	vaultTokenRenewCapability = []string{"update", "root"}

	// vaultTokenLookupCapability is the expected capability Nomad's
	// Vault token should have on the path. The token must have at least one of
	// the capabilities.
	vaultTokenLookupCapability = []string{"update", "root"}

	// vaultTokenLookupSelfCapability is the expected capability Nomad's
	// Vault token should have on the path. The token must have at least one of
	// the capabilities.
	vaultTokenLookupSelfCapability = []string{"update", "root"}

	// vaultTokenRevokeCapability is the expected capability Nomad's
	// Vault token should have on the path. The token must have at least one of
	// the capabilities.
	vaultTokenRevokeCapability = []string{"update", "root"}

	// vaultRoleLookupCapability is the expected capability Nomad's Vault
	// token should have on the path. The token must have at least one of the
	// capabilities.
	vaultRoleLookupCapability = []string{"read", "root"}

	// vaultTokenRoleCreateCapability is the expected capability Nomad's Vault
	// token should have on the path. The token must have at least one of the
	// capabilities.
	vaultTokenRoleCreateCapability = []string{"update", "root"}
)

// VaultClient is the Server's interface for interfacing with Vault
type VaultClient interface {
	// SetActive activates or de-activates the Vault client. When active, token
	// creation/lookup/revocation operations are allowed.
	SetActive(active bool)

	// SetConfig updates the config used by the Vault client
	SetConfig(config *config.VaultConfig) error

	// CreateToken takes an allocation and task and returns an appropriate Vault
	// Secret
	CreateToken(ctx context.Context, a *structs.Allocation, task string) (*vapi.Secret, error)

	// LookupToken takes a token string and returns the Vault secret describing
	// it, which includes its policies.
	LookupToken(ctx context.Context, token string) (*vapi.Secret, error)

	// RevokeTokens takes a set of token accessors and revokes the tokens.
	// committed marks accessors that should be purged once revoked and
	// retried later if revocation or purging fails.
	RevokeTokens(ctx context.Context, accessors []*structs.VaultAccessor, committed bool) error

	// Stop is used to stop token renewal
	Stop()

	// Running returns whether the Vault client is running
	Running() bool

	// Stats returns the Vault client's statistics
	Stats() *VaultStats

	// EmitStats emits the client's statistics at the given period until stopCh
	// is called.
	EmitStats(period time.Duration, stopCh chan struct{})
}

// VaultStats holds the stats about Vault tokens created and managed by
// Nomad.
type VaultStats struct {
	// TrackedForRevoke is the count of tokens that are being tracked to be
	// revoked since they could not be immediately revoked.
	TrackedForRevoke int
}

// PurgeVaultAccessorFn is called to remove VaultAccessors from the system. If
// the function returns an error, the token will still be tracked and revocation
// will retry till there is a success
type PurgeVaultAccessorFn func(accessors []*structs.VaultAccessor) error

// tokenData holds the relevant information about the Vault token passed to the
// client. It is decoded from the data block of a token lookup response.
type tokenData struct {
	// CreationTTL is the TTL the token was created with; the client treats it
	// as a number of seconds.
	CreationTTL int `mapstructure:"creation_ttl"`

	// TTL is the token's remaining TTL; zero is treated as expired.
	TTL int `mapstructure:"ttl"`

	// Renewable marks whether the token can be renewed.
	Renewable bool `mapstructure:"renewable"`

	// Policies is the set of policies attached to the token.
	Policies []string `mapstructure:"policies"`

	// Role is the token role name reported by the lookup; it is used when no
	// role is set in the Vault config.
	Role string `mapstructure:"role"`

	// Root is not decoded from Vault; it is set by the client when Policies
	// contains "root".
	Root bool
}

// vaultClient is the Server's implementation of the VaultClient interface. The
// client renews the PeriodicToken given in the Vault configuration and provides
// the Server with the ability to create child tokens and lookup the permissions
// of tokens.
type vaultClient struct {
	// limiter is used to rate limit requests to Vault
	limiter *rate.Limiter

	// client is the Vault API client
	client *vapi.Client

	// auth is the Vault token auth API client
	auth *vapi.TokenAuth

	// config is the user passed Vault config
	config *config.VaultConfig

	// connEstablished marks whether we have an established connection to Vault.
	connEstablished bool

	// connEstablishedErr marks an error that can occur when establishing a
	// connection
	connEstablishedErr error

	// token is the raw token used by the client
	token string

	// tokenData is the data of the passed Vault token
	tokenData *tokenData

	// revoking tracks the VaultAccessors that must be revoked, mapped to the
	// time after which revocation is no longer attempted. purgeFn is the
	// callback used to purge accessors once they are revoked. revLock is held
	// by SetActive and storeForRevocation while they touch revoking.
	revoking map[*structs.VaultAccessor]time.Time
	purgeFn  PurgeVaultAccessorFn
	revLock  sync.Mutex

	// active indicates whether the vaultClient is active. It should be
	// accessed using a helper and updated atomically
	active int32

	// running indicates whether the vault client is started.
	running bool

	// childTTL is the TTL for child tokens, kept as a duration string.
	childTTL string

	// lastRenewed is the time the token was last renewed
	lastRenewed time.Time

	// tomb tracks the background goroutines so they can be killed and waited
	// on. logger receives the client's log output.
	tomb   *tomb.Tomb
	logger *log.Logger

	// stats stores the stats; statsLock is held while they are updated
	stats     *VaultStats
	statsLock sync.RWMutex

	// l is used to lock the configuration aspects of the client such that
	// multiple callers can't cause conflicting config updates
	l sync.Mutex
}

// NewVaultClient returns a Vault client from the given config. If the client
// couldn't be made an error is returned.
234 func NewVaultClient(c *config.VaultConfig, logger *log.Logger, purgeFn PurgeVaultAccessorFn) (*vaultClient, error) { 235 if c == nil { 236 return nil, fmt.Errorf("must pass valid VaultConfig") 237 } 238 239 if logger == nil { 240 return nil, fmt.Errorf("must pass valid logger") 241 } 242 243 v := &vaultClient{ 244 config: c, 245 logger: logger, 246 limiter: rate.NewLimiter(requestRateLimit, int(requestRateLimit)), 247 revoking: make(map[*structs.VaultAccessor]time.Time), 248 purgeFn: purgeFn, 249 tomb: &tomb.Tomb{}, 250 stats: new(VaultStats), 251 } 252 253 if v.config.IsEnabled() { 254 if err := v.buildClient(); err != nil { 255 return nil, err 256 } 257 258 // Launch the required goroutines 259 v.tomb.Go(wrapNilError(v.establishConnection)) 260 v.tomb.Go(wrapNilError(v.revokeDaemon)) 261 262 v.running = true 263 } 264 265 return v, nil 266 } 267 268 func (v *vaultClient) Stop() { 269 v.l.Lock() 270 running := v.running 271 v.running = false 272 v.l.Unlock() 273 274 if running { 275 v.tomb.Kill(nil) 276 v.tomb.Wait() 277 v.flush() 278 } 279 } 280 281 func (v *vaultClient) Running() bool { 282 v.l.Lock() 283 defer v.l.Unlock() 284 return v.running 285 } 286 287 // SetActive activates or de-activates the Vault client. When active, token 288 // creation/lookup/revocation operation are allowed. 
All queued revocations are 289 // cancelled if set un-active as it is assumed another instances is taking over 290 func (v *vaultClient) SetActive(active bool) { 291 if active { 292 atomic.StoreInt32(&v.active, 1) 293 } else { 294 atomic.StoreInt32(&v.active, 0) 295 } 296 297 // Clear out the revoking tokens 298 v.revLock.Lock() 299 v.revoking = make(map[*structs.VaultAccessor]time.Time) 300 v.revLock.Unlock() 301 302 return 303 } 304 305 // flush is used to reset the state of the vault client 306 func (v *vaultClient) flush() { 307 v.l.Lock() 308 defer v.l.Unlock() 309 310 v.client = nil 311 v.auth = nil 312 v.connEstablished = false 313 v.connEstablishedErr = nil 314 v.token = "" 315 v.tokenData = nil 316 v.revoking = make(map[*structs.VaultAccessor]time.Time) 317 v.childTTL = "" 318 v.tomb = &tomb.Tomb{} 319 } 320 321 // SetConfig is used to update the Vault config being used. A temporary outage 322 // may occur after calling as it re-establishes a connection to Vault 323 func (v *vaultClient) SetConfig(config *config.VaultConfig) error { 324 if config == nil { 325 return fmt.Errorf("must pass valid VaultConfig") 326 } 327 328 v.l.Lock() 329 defer v.l.Unlock() 330 331 // Kill any background routintes 332 if v.running { 333 // Stop accepting any new request 334 v.connEstablished = false 335 336 // Kill any background routine and create a new tomb 337 v.tomb.Kill(nil) 338 v.tomb.Wait() 339 v.tomb = &tomb.Tomb{} 340 v.running = false 341 } 342 343 // Store the new config 344 v.config = config 345 346 // Check if we should relaunch 347 if v.config.IsEnabled() { 348 // Rebuild the client 349 if err := v.buildClient(); err != nil { 350 return err 351 } 352 353 // Launch the required goroutines 354 v.tomb.Go(wrapNilError(v.establishConnection)) 355 v.tomb.Go(wrapNilError(v.revokeDaemon)) 356 v.running = true 357 } 358 359 return nil 360 } 361 362 // buildClient is used to build a Vault client based on the stored Vault config 363 func (v *vaultClient) buildClient() 
error { 364 // Validate we have the required fields. 365 if v.config.Token == "" { 366 return errors.New("Vault token must be set") 367 } else if v.config.Addr == "" { 368 return errors.New("Vault address must be set") 369 } 370 371 // Parse the TTL if it is set 372 if v.config.TaskTokenTTL != "" { 373 d, err := time.ParseDuration(v.config.TaskTokenTTL) 374 if err != nil { 375 return fmt.Errorf("failed to parse TaskTokenTTL %q: %v", v.config.TaskTokenTTL, err) 376 } 377 378 if d.Nanoseconds() < minimumTokenTTL.Nanoseconds() { 379 return fmt.Errorf("ChildTokenTTL is less than minimum allowed of %v", minimumTokenTTL) 380 } 381 382 v.childTTL = v.config.TaskTokenTTL 383 } else { 384 // Default the TaskTokenTTL 385 v.childTTL = defaultTokenTTL 386 } 387 388 // Get the Vault API configuration 389 apiConf, err := v.config.ApiConfig() 390 if err != nil { 391 return fmt.Errorf("Failed to create Vault API config: %v", err) 392 } 393 394 // Create the Vault API client 395 client, err := vapi.NewClient(apiConf) 396 if err != nil { 397 v.logger.Printf("[ERR] vault: failed to create Vault client. Not retrying: %v", err) 398 return err 399 } 400 401 // Set the token and store the client 402 v.token = v.config.Token 403 client.SetToken(v.token) 404 v.client = client 405 v.auth = client.Auth().Token() 406 return nil 407 } 408 409 // establishConnection is used to make first contact with Vault. This should be 410 // called in a go-routine since the connection is retried til the Vault Client 411 // is stopped or the connection is successfully made at which point the renew 412 // loop is started. 
413 func (v *vaultClient) establishConnection() { 414 // Create the retry timer and set initial duration to zero so it fires 415 // immediately 416 retryTimer := time.NewTimer(0) 417 418 OUTER: 419 for { 420 select { 421 case <-v.tomb.Dying(): 422 return 423 case <-retryTimer.C: 424 // Ensure the API is reachable 425 if _, err := v.client.Sys().InitStatus(); err != nil { 426 v.logger.Printf("[WARN] vault: failed to contact Vault API. Retrying in %v: %v", 427 v.config.ConnectionRetryIntv, err) 428 retryTimer.Reset(v.config.ConnectionRetryIntv) 429 continue OUTER 430 } 431 432 break OUTER 433 } 434 } 435 436 // Retrieve our token, validate it and parse the lease duration 437 if err := v.parseSelfToken(); err != nil { 438 v.logger.Printf("[ERR] vault: failed to validate self token/role and not retrying: %v", err) 439 v.l.Lock() 440 v.connEstablished = false 441 v.connEstablishedErr = err 442 v.l.Unlock() 443 return 444 } 445 446 // Set the wrapping function such that token creation is wrapped now 447 // that we know our role 448 v.client.SetWrappingLookupFunc(v.getWrappingFn()) 449 450 // If we are given a non-root token, start renewing it 451 if v.tokenData.Root && v.tokenData.CreationTTL == 0 { 452 v.logger.Printf("[DEBUG] vault: not renewing token as it is root") 453 } else { 454 v.logger.Printf("[DEBUG] vault: token lease duration is %v", 455 time.Duration(v.tokenData.CreationTTL)*time.Second) 456 v.tomb.Go(wrapNilError(v.renewalLoop)) 457 } 458 459 v.l.Lock() 460 v.connEstablished = true 461 v.connEstablishedErr = nil 462 v.l.Unlock() 463 } 464 465 // renewalLoop runs the renew loop. This should only be called if we are given a 466 // non-root token. 
467 func (v *vaultClient) renewalLoop() { 468 // Create the renewal timer and set initial duration to zero so it fires 469 // immediately 470 authRenewTimer := time.NewTimer(0) 471 472 // Backoff is to reduce the rate we try to renew with Vault under error 473 // situations 474 backoff := 0.0 475 476 for { 477 select { 478 case <-v.tomb.Dying(): 479 return 480 case <-authRenewTimer.C: 481 // Renew the token and determine the new expiration 482 err := v.renew() 483 currentExpiration := v.lastRenewed.Add(time.Duration(v.tokenData.CreationTTL) * time.Second) 484 485 // Successfully renewed 486 if err == nil { 487 // If we take the expiration (lastRenewed + auth duration) and 488 // subtract the current time, we get a duration until expiry. 489 // Set the timer to poke us after half of that time is up. 490 durationUntilRenew := currentExpiration.Sub(time.Now()) / 2 491 492 v.logger.Printf("[INFO] vault: renewing token in %v", durationUntilRenew) 493 authRenewTimer.Reset(durationUntilRenew) 494 495 // Reset any backoff 496 backoff = 0 497 break 498 } 499 500 // Back off, increasing the amount of backoff each time. 
There are some rules: 501 // 502 // * If we have an existing authentication that is going to expire, 503 // never back off more than half of the amount of time remaining 504 // until expiration 505 // * Never back off more than 30 seconds multiplied by a random 506 // value between 1 and 2 507 // * Use randomness so that many clients won't keep hitting Vault 508 // at the same time 509 510 // Set base values and add some backoff 511 512 v.logger.Printf("[WARN] vault: got error or bad auth, so backing off: %v", err) 513 switch { 514 case backoff < 5: 515 backoff = 5 516 case backoff >= 24: 517 backoff = 30 518 default: 519 backoff = backoff * 1.25 520 } 521 522 // Add randomness 523 backoff = backoff * (1.0 + rand.Float64()) 524 525 maxBackoff := currentExpiration.Sub(time.Now()) / 2 526 if maxBackoff < 0 { 527 // We have failed to renew the token past its expiration. Stop 528 // renewing with Vault. 529 v.logger.Printf("[ERR] vault: failed to renew Vault token before lease expiration. Shutting down Vault client") 530 v.l.Lock() 531 v.connEstablished = false 532 v.connEstablishedErr = err 533 v.l.Unlock() 534 return 535 536 } else if backoff > maxBackoff.Seconds() { 537 backoff = maxBackoff.Seconds() 538 } 539 540 durationUntilRetry := time.Duration(backoff) * time.Second 541 v.logger.Printf("[INFO] vault: backing off for %v", durationUntilRetry) 542 543 authRenewTimer.Reset(durationUntilRetry) 544 } 545 } 546 } 547 548 // renew attempts to renew our Vault token. If the renewal fails, an error is 549 // returned. 
This method updates the lastRenewed time 550 func (v *vaultClient) renew() error { 551 // Attempt to renew the token 552 secret, err := v.auth.RenewSelf(v.tokenData.CreationTTL) 553 if err != nil { 554 return err 555 } 556 557 auth := secret.Auth 558 if auth == nil { 559 return fmt.Errorf("renewal successful but not auth information returned") 560 } else if auth.LeaseDuration == 0 { 561 return fmt.Errorf("renewal successful but no lease duration returned") 562 } 563 564 v.lastRenewed = time.Now() 565 v.logger.Printf("[DEBUG] vault: succesfully renewed server token") 566 return nil 567 } 568 569 // getWrappingFn returns an appropriate wrapping function for Nomad Servers 570 func (v *vaultClient) getWrappingFn() func(operation, path string) string { 571 createPath := "auth/token/create" 572 role := v.getRole() 573 if role != "" { 574 createPath = fmt.Sprintf("auth/token/create/%s", role) 575 } 576 577 return func(operation, path string) string { 578 // Only wrap the token create operation 579 if operation != "POST" || path != createPath { 580 return "" 581 } 582 583 return vaultTokenCreateTTL 584 } 585 } 586 587 // parseSelfToken looks up the Vault token in Vault and parses its data storing 588 // it in the client. If the token is not valid for Nomads purposes an error is 589 // returned. 
590 func (v *vaultClient) parseSelfToken() error { 591 // Get the initial lease duration 592 auth := v.client.Auth().Token() 593 var self *vapi.Secret 594 595 // Try looking up the token using the self endpoint 596 secret, err := auth.LookupSelf() 597 if err != nil { 598 // Try looking up our token directly 599 self, err = auth.Lookup(v.client.Token()) 600 if err != nil { 601 return fmt.Errorf("failed to lookup Vault periodic token: %v", err) 602 } 603 } 604 self = secret 605 606 // Read and parse the fields 607 var data tokenData 608 if err := mapstructure.WeakDecode(self.Data, &data); err != nil { 609 return fmt.Errorf("failed to parse Vault token's data block: %v", err) 610 } 611 612 root := false 613 for _, p := range data.Policies { 614 if p == "root" { 615 root = true 616 break 617 } 618 } 619 620 // Store the token data 621 data.Root = root 622 v.tokenData = &data 623 624 // The criteria that must be met for the token to be valid are as follows: 625 // 1) If token is non-root or is but has a creation ttl 626 // a) The token must be renewable 627 // b) Token must have a non-zero TTL 628 // 2) Must have update capability for "auth/token/lookup/" (used to verify incoming tokens) 629 // 3) Must have update capability for "/auth/token/revoke-accessor/" (used to revoke unneeded tokens) 630 // 4) If configured to create tokens against a role: 631 // a) Must have read capability for "auth/token/roles/<role_name" (Can just attemp a read) 632 // b) Must have update capability for path "auth/token/create/<role_name>" 633 // c) Role must: 634 // 1) Not allow orphans 635 // 2) Must allow tokens to be renewed 636 // 3) Must not have an explicit max TTL 637 // 4) Must have non-zero period 638 // 5) If not configured against a role, the token must be root 639 640 var mErr multierror.Error 641 role := v.getRole() 642 if !root { 643 // All non-root tokens must be renewable 644 if !data.Renewable { 645 multierror.Append(&mErr, fmt.Errorf("Vault token is not renewable or 
root")) 646 } 647 648 // All non-root tokens must have a lease duration 649 if data.CreationTTL == 0 { 650 multierror.Append(&mErr, fmt.Errorf("invalid lease duration of zero")) 651 } 652 653 // The lease duration can not be expired 654 if data.TTL == 0 { 655 multierror.Append(&mErr, fmt.Errorf("token TTL is zero")) 656 } 657 658 // There must be a valid role since we aren't root 659 if role == "" { 660 multierror.Append(&mErr, fmt.Errorf("token role name must be set when not using a root token")) 661 } 662 663 } else if data.CreationTTL != 0 { 664 // If the root token has a TTL it must be renewable 665 if !data.Renewable { 666 multierror.Append(&mErr, fmt.Errorf("Vault token has a TTL but is not renewable")) 667 } else if data.TTL == 0 { 668 // If the token has a TTL make sure it has not expired 669 multierror.Append(&mErr, fmt.Errorf("token TTL is zero")) 670 } 671 } 672 673 // Check we have the correct capabilities 674 if err := v.validateCapabilities(role, root); err != nil { 675 multierror.Append(&mErr, err) 676 } 677 678 // If given a role validate it 679 if role != "" { 680 if err := v.validateRole(role); err != nil { 681 multierror.Append(&mErr, err) 682 } 683 } 684 685 return mErr.ErrorOrNil() 686 } 687 688 // getRole returns the role name to be used when creating tokens 689 func (v *vaultClient) getRole() string { 690 if v.config.Role != "" { 691 return v.config.Role 692 } 693 694 return v.tokenData.Role 695 } 696 697 // validateCapabilities checks that Nomad's Vault token has the correct 698 // capabilities. 699 func (v *vaultClient) validateCapabilities(role string, root bool) error { 700 // Check if the token can lookup capabilities. 
701 var mErr multierror.Error 702 _, _, err := v.hasCapability(vaultCapabilitiesLookupPath, vaultCapabilitiesCapability) 703 if err != nil { 704 // Check if there is a permission denied 705 if vaultUnrecoverableError.MatchString(err.Error()) { 706 // Since we can't read permissions, we just log a warning that we 707 // can't tell if the Vault token will work 708 msg := fmt.Sprintf("Can not lookup token capabilities. "+ 709 "As such certain operations may fail in the future. "+ 710 "Please give Nomad a Vault token with one of the following "+ 711 "capabilities %q on %q so that the required capabilities can be verified", 712 vaultCapabilitiesCapability, vaultCapabilitiesLookupPath) 713 v.logger.Printf("[WARN] vault: %s", msg) 714 return nil 715 } else { 716 multierror.Append(&mErr, err) 717 } 718 } 719 720 // verify is a helper function that verifies the token has one of the 721 // capabilities on the given path and adds an issue to the error 722 verify := func(path string, requiredCaps []string) { 723 ok, caps, err := v.hasCapability(path, requiredCaps) 724 if err != nil { 725 multierror.Append(&mErr, err) 726 } else if !ok { 727 multierror.Append(&mErr, 728 fmt.Errorf("token must have one of the following capabilities %q on %q; has %v", requiredCaps, path, caps)) 729 } 730 } 731 732 // Check if we are verifying incoming tokens 733 if !v.config.AllowsUnauthenticated() { 734 verify(vaultTokenLookupPath, vaultTokenLookupCapability) 735 } 736 737 // Verify we can renew our selves tokens 738 verify(vaultTokenRenewPath, vaultTokenRenewCapability) 739 740 // Verify we can revoke tokens 741 verify(vaultTokenRevokePath, vaultTokenRevokeCapability) 742 743 // If we are using a role verify the capability 744 if role != "" { 745 // Verify we can read the role 746 verify(fmt.Sprintf(vaultRoleLookupPath, role), vaultRoleLookupCapability) 747 748 // Verify we can create from the role 749 verify(fmt.Sprintf(vaultTokenRoleCreatePath, role), vaultTokenRoleCreateCapability) 750 } 751 
752 return mErr.ErrorOrNil() 753 } 754 755 // hasCapability takes a path and returns whether the token has at least one of 756 // the required capabilities on the given path. It also returns the set of 757 // capabilities the token does have as well as any error that occured. 758 func (v *vaultClient) hasCapability(path string, required []string) (bool, []string, error) { 759 caps, err := v.client.Sys().CapabilitiesSelf(path) 760 if err != nil { 761 return false, nil, err 762 } 763 for _, c := range caps { 764 for _, r := range required { 765 if c == r { 766 return true, caps, nil 767 } 768 } 769 } 770 return false, caps, nil 771 } 772 773 // validateRole contacts Vault and checks that the given Vault role is valid for 774 // the purposes of being used by Nomad 775 func (v *vaultClient) validateRole(role string) error { 776 if role == "" { 777 return fmt.Errorf("Invalid empty role name") 778 } 779 780 // Validate the role 781 rsecret, err := v.client.Logical().Read(fmt.Sprintf("auth/token/roles/%s", role)) 782 if err != nil { 783 return fmt.Errorf("failed to lookup role %q: %v", role, err) 784 } 785 786 // Read and parse the fields 787 var data struct { 788 ExplicitMaxTtl int `mapstructure:"explicit_max_ttl"` 789 Orphan bool 790 Period int 791 Renewable bool 792 } 793 if err := mapstructure.WeakDecode(rsecret.Data, &data); err != nil { 794 return fmt.Errorf("failed to parse Vault role's data block: %v", err) 795 } 796 797 // Validate the role is acceptable 798 var mErr multierror.Error 799 if data.Orphan { 800 multierror.Append(&mErr, fmt.Errorf("Role must not allow orphans")) 801 } 802 803 if !data.Renewable { 804 multierror.Append(&mErr, fmt.Errorf("Role must allow tokens to be renewed")) 805 } 806 807 if data.ExplicitMaxTtl != 0 { 808 multierror.Append(&mErr, fmt.Errorf("Role can not use an explicit max ttl. 
Token must be periodic.")) 809 } 810 811 if data.Period == 0 { 812 multierror.Append(&mErr, fmt.Errorf("Role must have a non-zero period to make tokens periodic.")) 813 } 814 815 return mErr.ErrorOrNil() 816 } 817 818 // ConnectionEstablished returns whether a connection to Vault has been 819 // established and any error that potentially caused it to be false 820 func (v *vaultClient) ConnectionEstablished() (bool, error) { 821 v.l.Lock() 822 defer v.l.Unlock() 823 return v.connEstablished, v.connEstablishedErr 824 } 825 826 // Enabled returns whether the client is active 827 func (v *vaultClient) Enabled() bool { 828 v.l.Lock() 829 defer v.l.Unlock() 830 return v.config.IsEnabled() 831 } 832 833 // Active returns whether the client is active 834 func (v *vaultClient) Active() bool { 835 return atomic.LoadInt32(&v.active) == 1 836 } 837 838 // CreateToken takes the allocation and task and returns an appropriate Vault 839 // token. The call is rate limited and may be canceled with the passed policy. 
840 // When the error is recoverable, it will be of type RecoverableError 841 func (v *vaultClient) CreateToken(ctx context.Context, a *structs.Allocation, task string) (*vapi.Secret, error) { 842 if !v.Enabled() { 843 return nil, fmt.Errorf("Vault integration disabled") 844 } 845 if !v.Active() { 846 return nil, structs.NewRecoverableError(fmt.Errorf("Vault client not active"), true) 847 } 848 849 // Check if we have established a connection with Vault 850 if established, err := v.ConnectionEstablished(); !established && err == nil { 851 return nil, structs.NewRecoverableError(fmt.Errorf("Connection to Vault has not been established"), true) 852 } else if !established { 853 return nil, fmt.Errorf("Connection to Vault failed: %v", err) 854 } 855 856 // Track how long the request takes 857 defer metrics.MeasureSince([]string{"nomad", "vault", "create_token"}, time.Now()) 858 859 // Retrieve the Vault block for the task 860 policies := a.Job.VaultPolicies() 861 if policies == nil { 862 return nil, fmt.Errorf("Job doesn't require Vault policies") 863 } 864 tg, ok := policies[a.TaskGroup] 865 if !ok { 866 return nil, fmt.Errorf("Task group does not require Vault policies") 867 } 868 taskVault, ok := tg[task] 869 if !ok { 870 return nil, fmt.Errorf("Task does not require Vault policies") 871 } 872 873 // Build the creation request 874 req := &vapi.TokenCreateRequest{ 875 Policies: taskVault.Policies, 876 Metadata: map[string]string{ 877 "AllocationID": a.ID, 878 "Task": task, 879 "NodeID": a.NodeID, 880 }, 881 TTL: v.childTTL, 882 DisplayName: fmt.Sprintf("%s-%s", a.ID, task), 883 } 884 885 // Ensure we are under our rate limit 886 if err := v.limiter.Wait(ctx); err != nil { 887 return nil, err 888 } 889 890 // Make the request and switch depending on whether we are using a root 891 // token or a role based token 892 var secret *vapi.Secret 893 var err error 894 role := v.getRole() 895 if v.tokenData.Root && role == "" { 896 req.Period = v.childTTL 897 secret, err = 
v.auth.Create(req) 898 } else { 899 // Make the token using the role 900 secret, err = v.auth.CreateWithRole(req, v.getRole()) 901 } 902 903 // Determine whether it is unrecoverable 904 if err != nil { 905 if vaultUnrecoverableError.MatchString(err.Error()) { 906 return secret, err 907 } 908 909 // The error is recoverable 910 return nil, structs.NewRecoverableError(err, true) 911 } 912 913 return secret, nil 914 } 915 916 // LookupToken takes a Vault token and does a lookup against Vault. The call is 917 // rate limited and may be canceled with passed context. 918 func (v *vaultClient) LookupToken(ctx context.Context, token string) (*vapi.Secret, error) { 919 if !v.Enabled() { 920 return nil, fmt.Errorf("Vault integration disabled") 921 } 922 923 if !v.Active() { 924 return nil, fmt.Errorf("Vault client not active") 925 } 926 927 // Check if we have established a connection with Vault 928 if established, err := v.ConnectionEstablished(); !established && err == nil { 929 return nil, structs.NewRecoverableError(fmt.Errorf("Connection to Vault has not been established"), true) 930 } else if !established { 931 return nil, fmt.Errorf("Connection to Vault failed: %v", err) 932 } 933 934 // Track how long the request takes 935 defer metrics.MeasureSince([]string{"nomad", "vault", "lookup_token"}, time.Now()) 936 937 // Ensure we are under our rate limit 938 if err := v.limiter.Wait(ctx); err != nil { 939 return nil, err 940 } 941 942 // Lookup the token 943 return v.auth.Lookup(token) 944 } 945 946 // PoliciesFrom parses the set of policies returned by a token lookup. 
947 func PoliciesFrom(s *vapi.Secret) ([]string, error) { 948 if s == nil { 949 return nil, fmt.Errorf("cannot parse nil Vault secret") 950 } 951 var data tokenData 952 if err := mapstructure.WeakDecode(s.Data, &data); err != nil { 953 return nil, fmt.Errorf("failed to parse Vault token's data block: %v", err) 954 } 955 956 return data.Policies, nil 957 } 958 959 // RevokeTokens revokes the passed set of accessors. If committed is set, the 960 // purge function passed to the client is called. If there is an error purging 961 // either because of Vault failures or because of the purge function, the 962 // revocation is retried until the tokens TTL. 963 func (v *vaultClient) RevokeTokens(ctx context.Context, accessors []*structs.VaultAccessor, committed bool) error { 964 if !v.Enabled() { 965 return nil 966 } 967 968 if !v.Active() { 969 return fmt.Errorf("Vault client not active") 970 } 971 972 // Track how long the request takes 973 defer metrics.MeasureSince([]string{"nomad", "vault", "revoke_tokens"}, time.Now()) 974 975 // Check if we have established a connection with Vault. If not just add it 976 // to the queue 977 if established, err := v.ConnectionEstablished(); !established && err == nil { 978 // Only bother tracking it for later revocation if the accessor was 979 // committed 980 if committed { 981 v.storeForRevocation(accessors) 982 } 983 984 // Track that we are abandoning these accessors. 
985 metrics.IncrCounter([]string{"nomad", "vault", "undistributed_tokens_abandoned"}, float32(len(accessors))) 986 return nil 987 } 988 989 // Attempt to revoke immediately and if it fails, add it to the revoke queue 990 err := v.parallelRevoke(ctx, accessors) 991 if err != nil { 992 // If it is uncommitted, it is a best effort revoke as it will shortly 993 // TTL within the cubbyhole and has not been leaked to any outside 994 // system 995 if !committed { 996 metrics.IncrCounter([]string{"nomad", "vault", "undistributed_tokens_abandoned"}, float32(len(accessors))) 997 return nil 998 } 999 1000 v.logger.Printf("[WARN] vault: failed to revoke tokens. Will reattempt til TTL: %v", err) 1001 v.storeForRevocation(accessors) 1002 return nil 1003 } else if !committed { 1004 // Mark that it was revoked but there is nothing to purge so exit 1005 metrics.IncrCounter([]string{"nomad", "vault", "undistributed_tokens_revoked"}, float32(len(accessors))) 1006 return nil 1007 } 1008 1009 if err := v.purgeFn(accessors); err != nil { 1010 v.logger.Printf("[ERR] vault: failed to purge Vault accessors: %v", err) 1011 v.storeForRevocation(accessors) 1012 return nil 1013 } 1014 1015 // Track that it was revoked successfully 1016 metrics.IncrCounter([]string{"nomad", "vault", "distributed_tokens_revoked"}, float32(len(accessors))) 1017 1018 return nil 1019 } 1020 1021 // storeForRevocation stores the passed set of accessors for revocation. It 1022 // captrues their effective TTL by storing their create TTL plus the current 1023 // time. 1024 func (v *vaultClient) storeForRevocation(accessors []*structs.VaultAccessor) { 1025 v.revLock.Lock() 1026 v.statsLock.Lock() 1027 now := time.Now() 1028 for _, a := range accessors { 1029 v.revoking[a] = now.Add(time.Duration(a.CreationTTL) * time.Second) 1030 } 1031 v.stats.TrackedForRevoke = len(v.revoking) 1032 v.statsLock.Unlock() 1033 v.revLock.Unlock() 1034 } 1035 1036 // parallelRevoke revokes the passed VaultAccessors in parallel. 
1037 func (v *vaultClient) parallelRevoke(ctx context.Context, accessors []*structs.VaultAccessor) error { 1038 if !v.Enabled() { 1039 return fmt.Errorf("Vault integration disabled") 1040 } 1041 1042 if !v.Active() { 1043 return fmt.Errorf("Vault client not active") 1044 } 1045 1046 // Check if we have established a connection with Vault 1047 if established, err := v.ConnectionEstablished(); !established && err == nil { 1048 return structs.NewRecoverableError(fmt.Errorf("Connection to Vault has not been established"), true) 1049 } else if !established { 1050 return fmt.Errorf("Connection to Vault failed: %v", err) 1051 } 1052 1053 g, pCtx := errgroup.WithContext(ctx) 1054 1055 // Cap the handlers 1056 handlers := len(accessors) 1057 if handlers > maxParallelRevokes { 1058 handlers = maxParallelRevokes 1059 } 1060 1061 // Create the Vault Tokens 1062 input := make(chan *structs.VaultAccessor, handlers) 1063 for i := 0; i < handlers; i++ { 1064 g.Go(func() error { 1065 for { 1066 select { 1067 case va, ok := <-input: 1068 if !ok { 1069 return nil 1070 } 1071 1072 if err := v.auth.RevokeAccessor(va.Accessor); err != nil { 1073 return fmt.Errorf("failed to revoke token (alloc: %q, node: %q, task: %q): %v", va.AllocID, va.NodeID, va.Task, err) 1074 } 1075 case <-pCtx.Done(): 1076 return nil 1077 } 1078 } 1079 }) 1080 } 1081 1082 // Send the input 1083 go func() { 1084 defer close(input) 1085 for _, va := range accessors { 1086 select { 1087 case <-pCtx.Done(): 1088 return 1089 case input <- va: 1090 } 1091 } 1092 1093 }() 1094 1095 // Wait for everything to complete 1096 return g.Wait() 1097 } 1098 1099 // revokeDaemon should be called in a goroutine and is used to periodically 1100 // revoke Vault accessors that failed the original revocation 1101 func (v *vaultClient) revokeDaemon() { 1102 ticker := time.NewTicker(vaultRevocationIntv) 1103 defer ticker.Stop() 1104 1105 for { 1106 select { 1107 case <-v.tomb.Dying(): 1108 return 1109 case now := <-ticker.C: 1110 if 
established, _ := v.ConnectionEstablished(); !established { 1111 continue 1112 } 1113 1114 v.revLock.Lock() 1115 1116 // Fast path 1117 if len(v.revoking) == 0 { 1118 v.revLock.Unlock() 1119 continue 1120 } 1121 1122 // Build the list of allocations that need to revoked while pruning any TTL'd checks 1123 revoking := make([]*structs.VaultAccessor, 0, len(v.revoking)) 1124 for va, ttl := range v.revoking { 1125 if now.After(ttl) { 1126 delete(v.revoking, va) 1127 } else { 1128 revoking = append(revoking, va) 1129 } 1130 } 1131 1132 if err := v.parallelRevoke(context.Background(), revoking); err != nil { 1133 v.logger.Printf("[WARN] vault: background token revocation errored: %v", err) 1134 v.revLock.Unlock() 1135 continue 1136 } 1137 1138 // Unlock before a potentially expensive operation 1139 v.revLock.Unlock() 1140 1141 // Call the passed in token revocation function 1142 if err := v.purgeFn(revoking); err != nil { 1143 // Can continue since revocation is idempotent 1144 v.logger.Printf("[ERR] vault: token revocation errored: %v", err) 1145 continue 1146 } 1147 1148 // Track that tokens were revoked successfully 1149 metrics.IncrCounter([]string{"nomad", "vault", "distributed_tokens_revoked"}, float32(len(revoking))) 1150 1151 // Can delete from the tracked list now that we have purged 1152 v.revLock.Lock() 1153 v.statsLock.Lock() 1154 for _, va := range revoking { 1155 delete(v.revoking, va) 1156 } 1157 v.stats.TrackedForRevoke = len(v.revoking) 1158 v.statsLock.Unlock() 1159 v.revLock.Unlock() 1160 1161 } 1162 } 1163 } 1164 1165 // purgeVaultAccessors creates a Raft transaction to remove the passed Vault 1166 // Accessors 1167 func (s *Server) purgeVaultAccessors(accessors []*structs.VaultAccessor) error { 1168 // Commit this update via Raft 1169 req := structs.VaultAccessorsRequest{Accessors: accessors} 1170 _, _, err := s.raftApply(structs.VaultAccessorDegisterRequestType, req) 1171 return err 1172 } 1173 1174 // wrapNilError is a helper that returns a wrapped 
function that returns a nil 1175 // error 1176 func wrapNilError(f func()) func() error { 1177 return func() error { 1178 f() 1179 return nil 1180 } 1181 } 1182 1183 // setLimit is used to update the rate limit 1184 func (v *vaultClient) setLimit(l rate.Limit) { 1185 v.l.Lock() 1186 defer v.l.Unlock() 1187 v.limiter = rate.NewLimiter(l, int(l)) 1188 } 1189 1190 // Stats is used to query the state of the blocked eval tracker. 1191 func (v *vaultClient) Stats() *VaultStats { 1192 // Allocate a new stats struct 1193 stats := new(VaultStats) 1194 1195 v.statsLock.RLock() 1196 defer v.statsLock.RUnlock() 1197 1198 // Copy all the stats 1199 stats.TrackedForRevoke = v.stats.TrackedForRevoke 1200 1201 return stats 1202 } 1203 1204 // EmitStats is used to export metrics about the blocked eval tracker while enabled 1205 func (v *vaultClient) EmitStats(period time.Duration, stopCh chan struct{}) { 1206 for { 1207 select { 1208 case <-time.After(period): 1209 stats := v.Stats() 1210 metrics.SetGauge([]string{"nomad", "vault", "distributed_tokens_revoking"}, float32(stats.TrackedForRevoke)) 1211 case <-stopCh: 1212 return 1213 } 1214 } 1215 }