// Source: github.com/emate/nomad@v0.8.2-wo-binpacking/nomad/vault.go

package nomad

import (
	"context"
	"errors"
	"fmt"
	"log"
	"math/rand"
	"sync"
	"sync/atomic"
	"time"

	"gopkg.in/tomb.v2"

	metrics "github.com/armon/go-metrics"
	multierror "github.com/hashicorp/go-multierror"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/nomad/structs/config"
	vapi "github.com/hashicorp/vault/api"
	"github.com/mitchellh/mapstructure"

	"golang.org/x/sync/errgroup"
	"golang.org/x/time/rate"
)

const (
	// vaultTokenCreateTTL is the duration the wrapped token for the client is
	// valid for, expressed as a duration string.
	vaultTokenCreateTTL = "60s"

	// minimumTokenTTL is the minimum Token TTL allowed for child tokens.
	minimumTokenTTL = 5 * time.Minute

	// defaultTokenTTL is the default Token TTL used when the passed token is a
	// root token such that child tokens aren't being created against a role
	// that has defined a TTL
	defaultTokenTTL = "72h"

	// requestRateLimit is the maximum number of requests per second Nomad will
	// make against Vault
	requestRateLimit rate.Limit = 500.0

	// maxParallelRevokes is the maximum number of parallel Vault
	// token revocation requests
	maxParallelRevokes = 64

	// vaultRevocationIntv is the interval at which Vault tokens that failed
	// initial revocation are retried
	vaultRevocationIntv = 5 * time.Minute

	// vaultCapabilitiesLookupPath is the path to lookup the capabilities of
	// one's token.
	vaultCapabilitiesLookupPath = "sys/capabilities-self"

	// vaultTokenRenewPath is the path used to renew our token
	vaultTokenRenewPath = "auth/token/renew-self"

	// vaultTokenLookupPath is the path used to lookup a token
	vaultTokenLookupPath = "auth/token/lookup"

	// vaultTokenRevokePath is the path used to revoke a token
	vaultTokenRevokePath = "auth/token/revoke-accessor"

	// vaultRoleLookupPath is the path to lookup a role. It is a format string
	// that takes the role name.
	vaultRoleLookupPath = "auth/token/roles/%s"

	// vaultTokenRoleCreatePath is the path to create a token from a role. It
	// is a format string that takes the role name.
	vaultTokenRoleCreatePath = "auth/token/create/%s"
)

var (
	// vaultCapabilitiesCapability is the expected capability of Nomad's Vault
	// token on the path. The token must have at least one of the
	// capabilities.
	vaultCapabilitiesCapability = []string{"update", "root"}

	// vaultTokenRenewCapability is the expected capability Nomad's
	// Vault token should have on the path. The token must have at least one of
	// the capabilities.
	vaultTokenRenewCapability = []string{"update", "root"}

	// vaultTokenLookupCapability is the expected capability Nomad's
	// Vault token should have on the path. The token must have at least one of
	// the capabilities.
	vaultTokenLookupCapability = []string{"update", "root"}

	// vaultTokenRevokeCapability is the expected capability Nomad's
	// Vault token should have on the path. The token must have at least one of
	// the capabilities.
	vaultTokenRevokeCapability = []string{"update", "root"}

	// vaultRoleLookupCapability is the expected capability Nomad's Vault
	// token should have on the path. The token must have at least one of the
	// capabilities.
	vaultRoleLookupCapability = []string{"read", "root"}

	// vaultTokenRoleCreateCapability is the expected capability Nomad's Vault
	// token should have on the path. The token must have at least one of the
	// capabilities.
	vaultTokenRoleCreateCapability = []string{"update", "root"}
)

// VaultClient is the Server's interface for interfacing with Vault
type VaultClient interface {
	// SetActive activates or de-activates the Vault client. When active, token
	// creation/lookup/revocation operations are allowed.
	SetActive(active bool)

	// SetConfig updates the config used by the Vault client
	SetConfig(config *config.VaultConfig) error

	// CreateToken takes an allocation and task and returns an appropriate Vault
	// Secret
	CreateToken(ctx context.Context, a *structs.Allocation, task string) (*vapi.Secret, error)

	// LookupToken takes a token string and returns the Vault Secret describing
	// that token.
	LookupToken(ctx context.Context, token string) (*vapi.Secret, error)

	// RevokeTokens takes a set of token accessors and revokes the tokens
	RevokeTokens(ctx context.Context, accessors []*structs.VaultAccessor, committed bool) error

	// Stop is used to stop token renewal
	Stop()

	// Running returns whether the Vault client is running
	Running() bool

	// Stats returns the Vault client's statistics
	Stats() *VaultStats

	// EmitStats emits the client's statistics at the given period until stopCh
	// is called.
	EmitStats(period time.Duration, stopCh chan struct{})
}

// VaultStats returns all the stats about Vault tokens created and managed by
// Nomad.
type VaultStats struct {
	// TrackedForRevoke is the count of tokens that are being tracked to be
	// revoked since they could not be immediately revoked.
	TrackedForRevoke int
}

// PurgeVaultAccessor is called to remove VaultAccessors from the system.
If 145 // the function returns an error, the token will still be tracked and revocation 146 // will retry till there is a success 147 type PurgeVaultAccessorFn func(accessors []*structs.VaultAccessor) error 148 149 // tokenData holds the relevant information about the Vault token passed to the 150 // client. 151 type tokenData struct { 152 CreationTTL int `mapstructure:"creation_ttl"` 153 TTL int `mapstructure:"ttl"` 154 Renewable bool `mapstructure:"renewable"` 155 Policies []string `mapstructure:"policies"` 156 Role string `mapstructure:"role"` 157 Root bool 158 } 159 160 // vaultClient is the Servers implementation of the VaultClient interface. The 161 // client renews the PeriodicToken given in the Vault configuration and provides 162 // the Server with the ability to create child tokens and lookup the permissions 163 // of tokens. 164 type vaultClient struct { 165 // limiter is used to rate limit requests to Vault 166 limiter *rate.Limiter 167 168 // client is the Vault API client 169 client *vapi.Client 170 171 // auth is the Vault token auth API client 172 auth *vapi.TokenAuth 173 174 // config is the user passed Vault config 175 config *config.VaultConfig 176 177 // connEstablished marks whether we have an established connection to Vault. 178 connEstablished bool 179 180 // connEstablishedErr marks an error that can occur when establishing a 181 // connection 182 connEstablishedErr error 183 184 // token is the raw token used by the client 185 token string 186 187 // tokenData is the data of the passed Vault token 188 tokenData *tokenData 189 190 // revoking tracks the VaultAccessors that must be revoked 191 revoking map[*structs.VaultAccessor]time.Time 192 purgeFn PurgeVaultAccessorFn 193 revLock sync.Mutex 194 195 // active indicates whether the vaultClient is active. It should be 196 // accessed using a helper and updated atomically 197 active int32 198 199 // running indicates whether the vault client is started. 
200 running bool 201 202 // childTTL is the TTL for child tokens. 203 childTTL string 204 205 // lastRenewed is the time the token was last renewed 206 lastRenewed time.Time 207 208 tomb *tomb.Tomb 209 logger *log.Logger 210 211 // stats stores the stats 212 stats *VaultStats 213 statsLock sync.RWMutex 214 215 // l is used to lock the configuration aspects of the client such that 216 // multiple callers can't cause conflicting config updates 217 l sync.Mutex 218 } 219 220 // NewVaultClient returns a Vault client from the given config. If the client 221 // couldn't be made an error is returned. 222 func NewVaultClient(c *config.VaultConfig, logger *log.Logger, purgeFn PurgeVaultAccessorFn) (*vaultClient, error) { 223 if c == nil { 224 return nil, fmt.Errorf("must pass valid VaultConfig") 225 } 226 227 if logger == nil { 228 return nil, fmt.Errorf("must pass valid logger") 229 } 230 231 v := &vaultClient{ 232 config: c, 233 logger: logger, 234 limiter: rate.NewLimiter(requestRateLimit, int(requestRateLimit)), 235 revoking: make(map[*structs.VaultAccessor]time.Time), 236 purgeFn: purgeFn, 237 tomb: &tomb.Tomb{}, 238 stats: new(VaultStats), 239 } 240 241 if v.config.IsEnabled() { 242 if err := v.buildClient(); err != nil { 243 return nil, err 244 } 245 246 // Launch the required goroutines 247 v.tomb.Go(wrapNilError(v.establishConnection)) 248 v.tomb.Go(wrapNilError(v.revokeDaemon)) 249 250 v.running = true 251 } 252 253 return v, nil 254 } 255 256 func (v *vaultClient) Stop() { 257 v.l.Lock() 258 running := v.running 259 v.running = false 260 v.l.Unlock() 261 262 if running { 263 v.tomb.Kill(nil) 264 v.tomb.Wait() 265 v.flush() 266 } 267 } 268 269 func (v *vaultClient) Running() bool { 270 v.l.Lock() 271 defer v.l.Unlock() 272 return v.running 273 } 274 275 // SetActive activates or de-activates the Vault client. When active, token 276 // creation/lookup/revocation operation are allowed. 
All queued revocations are 277 // cancelled if set un-active as it is assumed another instances is taking over 278 func (v *vaultClient) SetActive(active bool) { 279 if active { 280 atomic.StoreInt32(&v.active, 1) 281 } else { 282 atomic.StoreInt32(&v.active, 0) 283 } 284 285 // Clear out the revoking tokens 286 v.revLock.Lock() 287 v.revoking = make(map[*structs.VaultAccessor]time.Time) 288 v.revLock.Unlock() 289 290 return 291 } 292 293 // flush is used to reset the state of the vault client 294 func (v *vaultClient) flush() { 295 v.l.Lock() 296 defer v.l.Unlock() 297 298 v.client = nil 299 v.auth = nil 300 v.connEstablished = false 301 v.connEstablishedErr = nil 302 v.token = "" 303 v.tokenData = nil 304 v.revoking = make(map[*structs.VaultAccessor]time.Time) 305 v.childTTL = "" 306 v.tomb = &tomb.Tomb{} 307 } 308 309 // SetConfig is used to update the Vault config being used. A temporary outage 310 // may occur after calling as it re-establishes a connection to Vault 311 func (v *vaultClient) SetConfig(config *config.VaultConfig) error { 312 if config == nil { 313 return fmt.Errorf("must pass valid VaultConfig") 314 } 315 316 v.l.Lock() 317 defer v.l.Unlock() 318 319 // Kill any background routines 320 if v.running { 321 // Stop accepting any new request 322 v.connEstablished = false 323 324 // Kill any background routine and create a new tomb 325 v.tomb.Kill(nil) 326 v.tomb.Wait() 327 v.tomb = &tomb.Tomb{} 328 v.running = false 329 } 330 331 // Store the new config 332 v.config = config 333 334 // Check if we should relaunch 335 if v.config.IsEnabled() { 336 // Rebuild the client 337 if err := v.buildClient(); err != nil { 338 return err 339 } 340 341 // Launch the required goroutines 342 v.tomb.Go(wrapNilError(v.establishConnection)) 343 v.tomb.Go(wrapNilError(v.revokeDaemon)) 344 v.running = true 345 } 346 347 return nil 348 } 349 350 // buildClient is used to build a Vault client based on the stored Vault config 351 func (v *vaultClient) buildClient() error 
{ 352 // Validate we have the required fields. 353 if v.config.Token == "" { 354 return errors.New("Vault token must be set") 355 } else if v.config.Addr == "" { 356 return errors.New("Vault address must be set") 357 } 358 359 // Parse the TTL if it is set 360 if v.config.TaskTokenTTL != "" { 361 d, err := time.ParseDuration(v.config.TaskTokenTTL) 362 if err != nil { 363 return fmt.Errorf("failed to parse TaskTokenTTL %q: %v", v.config.TaskTokenTTL, err) 364 } 365 366 if d.Nanoseconds() < minimumTokenTTL.Nanoseconds() { 367 return fmt.Errorf("ChildTokenTTL is less than minimum allowed of %v", minimumTokenTTL) 368 } 369 370 v.childTTL = v.config.TaskTokenTTL 371 } else { 372 // Default the TaskTokenTTL 373 v.childTTL = defaultTokenTTL 374 } 375 376 // Get the Vault API configuration 377 apiConf, err := v.config.ApiConfig() 378 if err != nil { 379 return fmt.Errorf("Failed to create Vault API config: %v", err) 380 } 381 382 // Create the Vault API client 383 client, err := vapi.NewClient(apiConf) 384 if err != nil { 385 v.logger.Printf("[ERR] vault: failed to create Vault client. Not retrying: %v", err) 386 return err 387 } 388 389 // Set the token and store the client 390 v.token = v.config.Token 391 client.SetToken(v.token) 392 v.client = client 393 v.auth = client.Auth().Token() 394 return nil 395 } 396 397 // establishConnection is used to make first contact with Vault. This should be 398 // called in a go-routine since the connection is retried until the Vault Client 399 // is stopped or the connection is successfully made at which point the renew 400 // loop is started. 
401 func (v *vaultClient) establishConnection() { 402 // Create the retry timer and set initial duration to zero so it fires 403 // immediately 404 retryTimer := time.NewTimer(0) 405 initStatus := false 406 OUTER: 407 for { 408 select { 409 case <-v.tomb.Dying(): 410 return 411 case <-retryTimer.C: 412 // Ensure the API is reachable 413 if !initStatus { 414 if _, err := v.client.Sys().InitStatus(); err != nil { 415 v.logger.Printf("[WARN] vault: failed to contact Vault API. Retrying in %v: %v", 416 v.config.ConnectionRetryIntv, err) 417 retryTimer.Reset(v.config.ConnectionRetryIntv) 418 continue OUTER 419 } 420 initStatus = true 421 } 422 // Retry validating the token till success 423 if err := v.parseSelfToken(); err != nil { 424 v.logger.Printf("[ERR] vault: failed to validate self token/role. Retrying in %v: %v", v.config.ConnectionRetryIntv, err) 425 retryTimer.Reset(v.config.ConnectionRetryIntv) 426 v.l.Lock() 427 v.connEstablished = true 428 v.connEstablishedErr = fmt.Errorf("Nomad Server failed to establish connections to Vault: %v", err) 429 v.l.Unlock() 430 continue OUTER 431 } 432 break OUTER 433 } 434 } 435 436 // Set the wrapping function such that token creation is wrapped now 437 // that we know our role 438 v.client.SetWrappingLookupFunc(v.getWrappingFn()) 439 440 // If we are given a non-root token, start renewing it 441 if v.tokenData.Root && v.tokenData.CreationTTL == 0 { 442 v.logger.Printf("[DEBUG] vault: not renewing token as it is root") 443 } else { 444 v.logger.Printf("[DEBUG] vault: token lease duration is %v", 445 time.Duration(v.tokenData.CreationTTL)*time.Second) 446 v.tomb.Go(wrapNilError(v.renewalLoop)) 447 } 448 449 v.l.Lock() 450 v.connEstablished = true 451 v.connEstablishedErr = nil 452 v.l.Unlock() 453 } 454 455 // renewalLoop runs the renew loop. This should only be called if we are given a 456 // non-root token. 
457 func (v *vaultClient) renewalLoop() { 458 // Create the renewal timer and set initial duration to zero so it fires 459 // immediately 460 authRenewTimer := time.NewTimer(0) 461 462 // Backoff is to reduce the rate we try to renew with Vault under error 463 // situations 464 backoff := 0.0 465 466 for { 467 select { 468 case <-v.tomb.Dying(): 469 return 470 case <-authRenewTimer.C: 471 // Renew the token and determine the new expiration 472 err := v.renew() 473 currentExpiration := v.lastRenewed.Add(time.Duration(v.tokenData.CreationTTL) * time.Second) 474 475 // Successfully renewed 476 if err == nil { 477 // If we take the expiration (lastRenewed + auth duration) and 478 // subtract the current time, we get a duration until expiry. 479 // Set the timer to poke us after half of that time is up. 480 durationUntilRenew := currentExpiration.Sub(time.Now()) / 2 481 482 v.logger.Printf("[INFO] vault: renewing token in %v", durationUntilRenew) 483 authRenewTimer.Reset(durationUntilRenew) 484 485 // Reset any backoff 486 backoff = 0 487 break 488 } 489 490 // Back off, increasing the amount of backoff each time. 
There are some rules: 491 // 492 // * If we have an existing authentication that is going to expire, 493 // never back off more than half of the amount of time remaining 494 // until expiration 495 // * Never back off more than 30 seconds multiplied by a random 496 // value between 1 and 2 497 // * Use randomness so that many clients won't keep hitting Vault 498 // at the same time 499 500 // Set base values and add some backoff 501 502 v.logger.Printf("[WARN] vault: got error or bad auth, so backing off: %v", err) 503 switch { 504 case backoff < 5: 505 backoff = 5 506 case backoff >= 24: 507 backoff = 30 508 default: 509 backoff = backoff * 1.25 510 } 511 512 // Add randomness 513 backoff = backoff * (1.0 + rand.Float64()) 514 515 maxBackoff := currentExpiration.Sub(time.Now()) / 2 516 if maxBackoff < 0 { 517 // We have failed to renew the token past its expiration. Stop 518 // renewing with Vault. 519 v.logger.Printf("[ERR] vault: failed to renew Vault token before lease expiration. Shutting down Vault client") 520 v.l.Lock() 521 v.connEstablished = false 522 v.connEstablishedErr = err 523 v.l.Unlock() 524 return 525 526 } else if backoff > maxBackoff.Seconds() { 527 backoff = maxBackoff.Seconds() 528 } 529 530 durationUntilRetry := time.Duration(backoff) * time.Second 531 v.logger.Printf("[INFO] vault: backing off for %v", durationUntilRetry) 532 533 authRenewTimer.Reset(durationUntilRetry) 534 } 535 } 536 } 537 538 // renew attempts to renew our Vault token. If the renewal fails, an error is 539 // returned. 
This method updates the lastRenewed time 540 func (v *vaultClient) renew() error { 541 // Attempt to renew the token 542 secret, err := v.auth.RenewSelf(v.tokenData.CreationTTL) 543 if err != nil { 544 return err 545 } 546 547 auth := secret.Auth 548 if auth == nil { 549 return fmt.Errorf("renewal successful but not auth information returned") 550 } else if auth.LeaseDuration == 0 { 551 return fmt.Errorf("renewal successful but no lease duration returned") 552 } 553 554 v.lastRenewed = time.Now() 555 v.logger.Printf("[DEBUG] vault: successfully renewed server token") 556 return nil 557 } 558 559 // getWrappingFn returns an appropriate wrapping function for Nomad Servers 560 func (v *vaultClient) getWrappingFn() func(operation, path string) string { 561 createPath := "auth/token/create" 562 role := v.getRole() 563 if role != "" { 564 createPath = fmt.Sprintf("auth/token/create/%s", role) 565 } 566 567 return func(operation, path string) string { 568 // Only wrap the token create operation 569 if operation != "POST" || path != createPath { 570 return "" 571 } 572 573 return vaultTokenCreateTTL 574 } 575 } 576 577 // parseSelfToken looks up the Vault token in Vault and parses its data storing 578 // it in the client. If the token is not valid for Nomads purposes an error is 579 // returned. 
580 func (v *vaultClient) parseSelfToken() error { 581 // Get the initial lease duration 582 auth := v.client.Auth().Token() 583 var self *vapi.Secret 584 585 // Try looking up the token using the self endpoint 586 secret, err := auth.LookupSelf() 587 if err != nil { 588 // Try looking up our token directly 589 self, err = auth.Lookup(v.client.Token()) 590 if err != nil { 591 return fmt.Errorf("failed to lookup Vault periodic token: %v", err) 592 } 593 } 594 self = secret 595 596 // Read and parse the fields 597 var data tokenData 598 if err := mapstructure.WeakDecode(self.Data, &data); err != nil { 599 return fmt.Errorf("failed to parse Vault token's data block: %v", err) 600 } 601 602 root := false 603 for _, p := range data.Policies { 604 if p == "root" { 605 root = true 606 break 607 } 608 } 609 610 // Store the token data 611 data.Root = root 612 v.tokenData = &data 613 614 // The criteria that must be met for the token to be valid are as follows: 615 // 1) If token is non-root or is but has a creation ttl 616 // a) The token must be renewable 617 // b) Token must have a non-zero TTL 618 // 2) Must have update capability for "auth/token/lookup/" (used to verify incoming tokens) 619 // 3) Must have update capability for "/auth/token/revoke-accessor/" (used to revoke unneeded tokens) 620 // 4) If configured to create tokens against a role: 621 // a) Must have read capability for "auth/token/roles/<role_name" (Can just attempt a read) 622 // b) Must have update capability for path "auth/token/create/<role_name>" 623 // c) Role must: 624 // 1) Must allow tokens to be renewed 625 // 2) Must not have an explicit max TTL 626 // 3) Must have non-zero period 627 // 5) If not configured against a role, the token must be root 628 629 var mErr multierror.Error 630 role := v.getRole() 631 if !root { 632 // All non-root tokens must be renewable 633 if !data.Renewable { 634 multierror.Append(&mErr, fmt.Errorf("Vault token is not renewable or root")) 635 } 636 637 // All 
non-root tokens must have a lease duration 638 if data.CreationTTL == 0 { 639 multierror.Append(&mErr, fmt.Errorf("invalid lease duration of zero")) 640 } 641 642 // The lease duration can not be expired 643 if data.TTL == 0 { 644 multierror.Append(&mErr, fmt.Errorf("token TTL is zero")) 645 } 646 647 // There must be a valid role since we aren't root 648 if role == "" { 649 multierror.Append(&mErr, fmt.Errorf("token role name must be set when not using a root token")) 650 } 651 652 } else if data.CreationTTL != 0 { 653 // If the root token has a TTL it must be renewable 654 if !data.Renewable { 655 multierror.Append(&mErr, fmt.Errorf("Vault token has a TTL but is not renewable")) 656 } else if data.TTL == 0 { 657 // If the token has a TTL make sure it has not expired 658 multierror.Append(&mErr, fmt.Errorf("token TTL is zero")) 659 } 660 } 661 662 // Check we have the correct capabilities 663 if err := v.validateCapabilities(role, root); err != nil { 664 multierror.Append(&mErr, err) 665 } 666 667 // If given a role validate it 668 if role != "" { 669 if err := v.validateRole(role); err != nil { 670 multierror.Append(&mErr, err) 671 } 672 } 673 674 return mErr.ErrorOrNil() 675 } 676 677 // getRole returns the role name to be used when creating tokens 678 func (v *vaultClient) getRole() string { 679 if v.config.Role != "" { 680 return v.config.Role 681 } 682 683 return v.tokenData.Role 684 } 685 686 // validateCapabilities checks that Nomad's Vault token has the correct 687 // capabilities. 688 func (v *vaultClient) validateCapabilities(role string, root bool) error { 689 // Check if the token can lookup capabilities. 
690 var mErr multierror.Error 691 _, _, err := v.hasCapability(vaultCapabilitiesLookupPath, vaultCapabilitiesCapability) 692 if err != nil { 693 // Check if there is a permission denied 694 if structs.VaultUnrecoverableError.MatchString(err.Error()) { 695 // Since we can't read permissions, we just log a warning that we 696 // can't tell if the Vault token will work 697 msg := fmt.Sprintf("Can not lookup token capabilities. "+ 698 "As such certain operations may fail in the future. "+ 699 "Please give Nomad a Vault token with one of the following "+ 700 "capabilities %q on %q so that the required capabilities can be verified", 701 vaultCapabilitiesCapability, vaultCapabilitiesLookupPath) 702 v.logger.Printf("[WARN] vault: %s", msg) 703 return nil 704 } else { 705 multierror.Append(&mErr, err) 706 } 707 } 708 709 // verify is a helper function that verifies the token has one of the 710 // capabilities on the given path and adds an issue to the error 711 verify := func(path string, requiredCaps []string) { 712 ok, caps, err := v.hasCapability(path, requiredCaps) 713 if err != nil { 714 multierror.Append(&mErr, err) 715 } else if !ok { 716 multierror.Append(&mErr, 717 fmt.Errorf("token must have one of the following capabilities %q on %q; has %v", requiredCaps, path, caps)) 718 } 719 } 720 721 // Check if we are verifying incoming tokens 722 if !v.config.AllowsUnauthenticated() { 723 verify(vaultTokenLookupPath, vaultTokenLookupCapability) 724 } 725 726 // Verify we can renew our selves tokens 727 verify(vaultTokenRenewPath, vaultTokenRenewCapability) 728 729 // Verify we can revoke tokens 730 verify(vaultTokenRevokePath, vaultTokenRevokeCapability) 731 732 // If we are using a role verify the capability 733 if role != "" { 734 // Verify we can read the role 735 verify(fmt.Sprintf(vaultRoleLookupPath, role), vaultRoleLookupCapability) 736 737 // Verify we can create from the role 738 verify(fmt.Sprintf(vaultTokenRoleCreatePath, role), vaultTokenRoleCreateCapability) 
739 } 740 741 return mErr.ErrorOrNil() 742 } 743 744 // hasCapability takes a path and returns whether the token has at least one of 745 // the required capabilities on the given path. It also returns the set of 746 // capabilities the token does have as well as any error that occurred. 747 func (v *vaultClient) hasCapability(path string, required []string) (bool, []string, error) { 748 caps, err := v.client.Sys().CapabilitiesSelf(path) 749 if err != nil { 750 return false, nil, err 751 } 752 for _, c := range caps { 753 for _, r := range required { 754 if c == r { 755 return true, caps, nil 756 } 757 } 758 } 759 return false, caps, nil 760 } 761 762 // validateRole contacts Vault and checks that the given Vault role is valid for 763 // the purposes of being used by Nomad 764 func (v *vaultClient) validateRole(role string) error { 765 if role == "" { 766 return fmt.Errorf("Invalid empty role name") 767 } 768 769 // Validate the role 770 rsecret, err := v.client.Logical().Read(fmt.Sprintf("auth/token/roles/%s", role)) 771 if err != nil { 772 return fmt.Errorf("failed to lookup role %q: %v", role, err) 773 } 774 if rsecret == nil { 775 return fmt.Errorf("Role %q does not exist", role) 776 } 777 778 // Read and parse the fields 779 var data struct { 780 ExplicitMaxTtl int `mapstructure:"explicit_max_ttl"` 781 Orphan bool 782 Period int 783 Renewable bool 784 } 785 if err := mapstructure.WeakDecode(rsecret.Data, &data); err != nil { 786 return fmt.Errorf("failed to parse Vault role's data block: %v", err) 787 } 788 789 // Validate the role is acceptable 790 var mErr multierror.Error 791 if !data.Renewable { 792 multierror.Append(&mErr, fmt.Errorf("Role must allow tokens to be renewed")) 793 } 794 795 if data.ExplicitMaxTtl != 0 { 796 multierror.Append(&mErr, fmt.Errorf("Role can not use an explicit max ttl. 
Token must be periodic.")) 797 } 798 799 if data.Period == 0 { 800 multierror.Append(&mErr, fmt.Errorf("Role must have a non-zero period to make tokens periodic.")) 801 } 802 803 return mErr.ErrorOrNil() 804 } 805 806 // ConnectionEstablished returns whether a connection to Vault has been 807 // established and any error that potentially caused it to be false 808 func (v *vaultClient) ConnectionEstablished() (bool, error) { 809 v.l.Lock() 810 defer v.l.Unlock() 811 return v.connEstablished, v.connEstablishedErr 812 } 813 814 // Enabled returns whether the client is active 815 func (v *vaultClient) Enabled() bool { 816 v.l.Lock() 817 defer v.l.Unlock() 818 return v.config.IsEnabled() 819 } 820 821 // Active returns whether the client is active 822 func (v *vaultClient) Active() bool { 823 return atomic.LoadInt32(&v.active) == 1 824 } 825 826 // CreateToken takes the allocation and task and returns an appropriate Vault 827 // token. The call is rate limited and may be canceled with the passed policy. 
828 // When the error is recoverable, it will be of type RecoverableError 829 func (v *vaultClient) CreateToken(ctx context.Context, a *structs.Allocation, task string) (*vapi.Secret, error) { 830 if !v.Enabled() { 831 return nil, fmt.Errorf("Vault integration disabled") 832 } 833 if !v.Active() { 834 return nil, structs.NewRecoverableError(fmt.Errorf("Vault client not active"), true) 835 } 836 837 // Check if we have established a connection with Vault 838 if established, err := v.ConnectionEstablished(); !established && err == nil { 839 return nil, structs.NewRecoverableError(fmt.Errorf("Connection to Vault has not been established"), true) 840 } else if err != nil { 841 return nil, err 842 } 843 844 // Track how long the request takes 845 defer metrics.MeasureSince([]string{"nomad", "vault", "create_token"}, time.Now()) 846 847 // Retrieve the Vault block for the task 848 policies := a.Job.VaultPolicies() 849 if policies == nil { 850 return nil, fmt.Errorf("Job doesn't require Vault policies") 851 } 852 tg, ok := policies[a.TaskGroup] 853 if !ok { 854 return nil, fmt.Errorf("Task group does not require Vault policies") 855 } 856 taskVault, ok := tg[task] 857 if !ok { 858 return nil, fmt.Errorf("Task does not require Vault policies") 859 } 860 861 // Build the creation request 862 req := &vapi.TokenCreateRequest{ 863 Policies: taskVault.Policies, 864 Metadata: map[string]string{ 865 "AllocationID": a.ID, 866 "Task": task, 867 "NodeID": a.NodeID, 868 }, 869 TTL: v.childTTL, 870 DisplayName: fmt.Sprintf("%s-%s", a.ID, task), 871 } 872 873 // Ensure we are under our rate limit 874 if err := v.limiter.Wait(ctx); err != nil { 875 return nil, err 876 } 877 878 // Make the request and switch depending on whether we are using a root 879 // token or a role based token 880 var secret *vapi.Secret 881 var err error 882 role := v.getRole() 883 if v.tokenData.Root && role == "" { 884 req.Period = v.childTTL 885 secret, err = v.auth.Create(req) 886 } else { 887 // Make the 
token using the role 888 secret, err = v.auth.CreateWithRole(req, v.getRole()) 889 } 890 891 // Determine whether it is unrecoverable 892 if err != nil { 893 if structs.VaultUnrecoverableError.MatchString(err.Error()) { 894 return secret, err 895 } 896 897 // The error is recoverable 898 return nil, structs.NewRecoverableError(err, true) 899 } 900 901 // Validate the response 902 var validationErr error 903 if secret == nil { 904 validationErr = fmt.Errorf("Vault returned nil Secret") 905 } else if secret.WrapInfo == nil { 906 validationErr = fmt.Errorf("Vault returned Secret with nil WrapInfo. Secret warnings: %v", secret.Warnings) 907 } else if secret.WrapInfo.WrappedAccessor == "" { 908 validationErr = fmt.Errorf("Vault returned WrapInfo without WrappedAccessor. Secret warnings: %v", secret.Warnings) 909 } 910 if validationErr != nil { 911 v.logger.Printf("[WARN] vault: failed to CreateToken: %v", err) 912 return nil, structs.NewRecoverableError(validationErr, true) 913 } 914 915 // Got a valid response 916 return secret, nil 917 } 918 919 // LookupToken takes a Vault token and does a lookup against Vault. The call is 920 // rate limited and may be canceled with passed context. 
921 func (v *vaultClient) LookupToken(ctx context.Context, token string) (*vapi.Secret, error) { 922 if !v.Enabled() { 923 return nil, fmt.Errorf("Vault integration disabled") 924 } 925 926 if !v.Active() { 927 return nil, fmt.Errorf("Vault client not active") 928 } 929 930 // Check if we have established a connection with Vault 931 if established, err := v.ConnectionEstablished(); !established && err == nil { 932 return nil, structs.NewRecoverableError(fmt.Errorf("Connection to Vault has not been established"), true) 933 } else if err != nil { 934 return nil, err 935 } 936 937 // Track how long the request takes 938 defer metrics.MeasureSince([]string{"nomad", "vault", "lookup_token"}, time.Now()) 939 940 // Ensure we are under our rate limit 941 if err := v.limiter.Wait(ctx); err != nil { 942 return nil, err 943 } 944 945 // Lookup the token 946 return v.auth.Lookup(token) 947 } 948 949 // PoliciesFrom parses the set of policies returned by a token lookup. 950 func PoliciesFrom(s *vapi.Secret) ([]string, error) { 951 if s == nil { 952 return nil, fmt.Errorf("cannot parse nil Vault secret") 953 } 954 var data tokenData 955 if err := mapstructure.WeakDecode(s.Data, &data); err != nil { 956 return nil, fmt.Errorf("failed to parse Vault token's data block: %v", err) 957 } 958 959 return data.Policies, nil 960 } 961 962 // RevokeTokens revokes the passed set of accessors. If committed is set, the 963 // purge function passed to the client is called. If there is an error purging 964 // either because of Vault failures or because of the purge function, the 965 // revocation is retried until the tokens TTL. 
966 func (v *vaultClient) RevokeTokens(ctx context.Context, accessors []*structs.VaultAccessor, committed bool) error { 967 if !v.Enabled() { 968 return nil 969 } 970 971 if !v.Active() { 972 return fmt.Errorf("Vault client not active") 973 } 974 975 // Track how long the request takes 976 defer metrics.MeasureSince([]string{"nomad", "vault", "revoke_tokens"}, time.Now()) 977 978 // Check if we have established a connection with Vault. If not just add it 979 // to the queue 980 if established, err := v.ConnectionEstablished(); !established && err == nil { 981 // Only bother tracking it for later revocation if the accessor was 982 // committed 983 if committed { 984 v.storeForRevocation(accessors) 985 } 986 987 // Track that we are abandoning these accessors. 988 metrics.IncrCounter([]string{"nomad", "vault", "undistributed_tokens_abandoned"}, float32(len(accessors))) 989 return nil 990 } 991 992 // Attempt to revoke immediately and if it fails, add it to the revoke queue 993 err := v.parallelRevoke(ctx, accessors) 994 if err != nil { 995 // If it is uncommitted, it is a best effort revoke as it will shortly 996 // TTL within the cubbyhole and has not been leaked to any outside 997 // system 998 if !committed { 999 metrics.IncrCounter([]string{"nomad", "vault", "undistributed_tokens_abandoned"}, float32(len(accessors))) 1000 return nil 1001 } 1002 1003 v.logger.Printf("[WARN] vault: failed to revoke tokens. 
Will reattempt until TTL: %v", err) 1004 v.storeForRevocation(accessors) 1005 return nil 1006 } else if !committed { 1007 // Mark that it was revoked but there is nothing to purge so exit 1008 metrics.IncrCounter([]string{"nomad", "vault", "undistributed_tokens_revoked"}, float32(len(accessors))) 1009 return nil 1010 } 1011 1012 if err := v.purgeFn(accessors); err != nil { 1013 v.logger.Printf("[ERR] vault: failed to purge Vault accessors: %v", err) 1014 v.storeForRevocation(accessors) 1015 return nil 1016 } 1017 1018 // Track that it was revoked successfully 1019 metrics.IncrCounter([]string{"nomad", "vault", "distributed_tokens_revoked"}, float32(len(accessors))) 1020 1021 return nil 1022 } 1023 1024 // storeForRevocation stores the passed set of accessors for revocation. It 1025 // captures their effective TTL by storing their create TTL plus the current 1026 // time. 1027 func (v *vaultClient) storeForRevocation(accessors []*structs.VaultAccessor) { 1028 v.revLock.Lock() 1029 v.statsLock.Lock() 1030 now := time.Now() 1031 for _, a := range accessors { 1032 v.revoking[a] = now.Add(time.Duration(a.CreationTTL) * time.Second) 1033 } 1034 v.stats.TrackedForRevoke = len(v.revoking) 1035 v.statsLock.Unlock() 1036 v.revLock.Unlock() 1037 } 1038 1039 // parallelRevoke revokes the passed VaultAccessors in parallel. 
1040 func (v *vaultClient) parallelRevoke(ctx context.Context, accessors []*structs.VaultAccessor) error { 1041 if !v.Enabled() { 1042 return fmt.Errorf("Vault integration disabled") 1043 } 1044 1045 if !v.Active() { 1046 return fmt.Errorf("Vault client not active") 1047 } 1048 1049 // Check if we have established a connection with Vault 1050 if established, err := v.ConnectionEstablished(); !established && err == nil { 1051 return structs.NewRecoverableError(fmt.Errorf("Connection to Vault has not been established"), true) 1052 } else if err != nil { 1053 return err 1054 } 1055 1056 g, pCtx := errgroup.WithContext(ctx) 1057 1058 // Cap the handlers 1059 handlers := len(accessors) 1060 if handlers > maxParallelRevokes { 1061 handlers = maxParallelRevokes 1062 } 1063 1064 // Create the Vault Tokens 1065 input := make(chan *structs.VaultAccessor, handlers) 1066 for i := 0; i < handlers; i++ { 1067 g.Go(func() error { 1068 for { 1069 select { 1070 case va, ok := <-input: 1071 if !ok { 1072 return nil 1073 } 1074 1075 if err := v.auth.RevokeAccessor(va.Accessor); err != nil { 1076 return fmt.Errorf("failed to revoke token (alloc: %q, node: %q, task: %q): %v", va.AllocID, va.NodeID, va.Task, err) 1077 } 1078 case <-pCtx.Done(): 1079 return nil 1080 } 1081 } 1082 }) 1083 } 1084 1085 // Send the input 1086 go func() { 1087 defer close(input) 1088 for _, va := range accessors { 1089 select { 1090 case <-pCtx.Done(): 1091 return 1092 case input <- va: 1093 } 1094 } 1095 1096 }() 1097 1098 // Wait for everything to complete 1099 return g.Wait() 1100 } 1101 1102 // revokeDaemon should be called in a goroutine and is used to periodically 1103 // revoke Vault accessors that failed the original revocation 1104 func (v *vaultClient) revokeDaemon() { 1105 ticker := time.NewTicker(vaultRevocationIntv) 1106 defer ticker.Stop() 1107 1108 for { 1109 select { 1110 case <-v.tomb.Dying(): 1111 return 1112 case now := <-ticker.C: 1113 if established, _ := v.ConnectionEstablished(); 
!established { 1114 continue 1115 } 1116 1117 v.revLock.Lock() 1118 1119 // Fast path 1120 if len(v.revoking) == 0 { 1121 v.revLock.Unlock() 1122 continue 1123 } 1124 1125 // Build the list of allocations that need to revoked while pruning any TTL'd checks 1126 revoking := make([]*structs.VaultAccessor, 0, len(v.revoking)) 1127 for va, ttl := range v.revoking { 1128 if now.After(ttl) { 1129 delete(v.revoking, va) 1130 } else { 1131 revoking = append(revoking, va) 1132 } 1133 } 1134 1135 if err := v.parallelRevoke(context.Background(), revoking); err != nil { 1136 v.logger.Printf("[WARN] vault: background token revocation errored: %v", err) 1137 v.revLock.Unlock() 1138 continue 1139 } 1140 1141 // Unlock before a potentially expensive operation 1142 v.revLock.Unlock() 1143 1144 // Call the passed in token revocation function 1145 if err := v.purgeFn(revoking); err != nil { 1146 // Can continue since revocation is idempotent 1147 v.logger.Printf("[ERR] vault: token revocation errored: %v", err) 1148 continue 1149 } 1150 1151 // Track that tokens were revoked successfully 1152 metrics.IncrCounter([]string{"nomad", "vault", "distributed_tokens_revoked"}, float32(len(revoking))) 1153 1154 // Can delete from the tracked list now that we have purged 1155 v.revLock.Lock() 1156 v.statsLock.Lock() 1157 for _, va := range revoking { 1158 delete(v.revoking, va) 1159 } 1160 v.stats.TrackedForRevoke = len(v.revoking) 1161 v.statsLock.Unlock() 1162 v.revLock.Unlock() 1163 1164 } 1165 } 1166 } 1167 1168 // purgeVaultAccessors creates a Raft transaction to remove the passed Vault 1169 // Accessors 1170 func (s *Server) purgeVaultAccessors(accessors []*structs.VaultAccessor) error { 1171 // Commit this update via Raft 1172 req := structs.VaultAccessorsRequest{Accessors: accessors} 1173 _, _, err := s.raftApply(structs.VaultAccessorDeregisterRequestType, req) 1174 return err 1175 } 1176 1177 // wrapNilError is a helper that returns a wrapped function that returns a nil 1178 // error 
1179 func wrapNilError(f func()) func() error { 1180 return func() error { 1181 f() 1182 return nil 1183 } 1184 } 1185 1186 // setLimit is used to update the rate limit 1187 func (v *vaultClient) setLimit(l rate.Limit) { 1188 v.l.Lock() 1189 defer v.l.Unlock() 1190 v.limiter = rate.NewLimiter(l, int(l)) 1191 } 1192 1193 // Stats is used to query the state of the blocked eval tracker. 1194 func (v *vaultClient) Stats() *VaultStats { 1195 // Allocate a new stats struct 1196 stats := new(VaultStats) 1197 1198 v.statsLock.RLock() 1199 defer v.statsLock.RUnlock() 1200 1201 // Copy all the stats 1202 stats.TrackedForRevoke = v.stats.TrackedForRevoke 1203 1204 return stats 1205 } 1206 1207 // EmitStats is used to export metrics about the blocked eval tracker while enabled 1208 func (v *vaultClient) EmitStats(period time.Duration, stopCh chan struct{}) { 1209 for { 1210 select { 1211 case <-time.After(period): 1212 stats := v.Stats() 1213 metrics.SetGauge([]string{"nomad", "vault", "distributed_tokens_revoking"}, float32(stats.TrackedForRevoke)) 1214 case <-stopCh: 1215 return 1216 } 1217 } 1218 }