github.com/taylorchu/nomad@v0.5.3-rc1.0.20170407200202-db11e7dd7b55/nomad/vault.go (about)

     1  package nomad
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"log"
     8  	"math/rand"
     9  	"regexp"
    10  	"sync"
    11  	"sync/atomic"
    12  	"time"
    13  
    14  	"gopkg.in/tomb.v2"
    15  
    16  	metrics "github.com/armon/go-metrics"
    17  	multierror "github.com/hashicorp/go-multierror"
    18  	"github.com/hashicorp/nomad/nomad/structs"
    19  	"github.com/hashicorp/nomad/nomad/structs/config"
    20  	vapi "github.com/hashicorp/vault/api"
    21  	"github.com/mitchellh/mapstructure"
    22  
    23  	"golang.org/x/sync/errgroup"
    24  	"golang.org/x/time/rate"
    25  )
    26  
const (
	// vaultTokenCreateTTL is the duration the wrapped token for the client is
	// valid for. It is expressed as a duration string (60 seconds).
	vaultTokenCreateTTL = "60s"

	// minimumTokenTTL is the minimum Token TTL allowed for child tokens.
	minimumTokenTTL = 5 * time.Minute

	// defaultTokenTTL is the default Token TTL used when the passed token is a
	// root token such that child tokens aren't being created against a role
	// that has defined a TTL
	defaultTokenTTL = "72h"

	// requestRateLimit is the maximum number of requests per second Nomad will
	// make against Vault
	requestRateLimit rate.Limit = 500.0

	// maxParallelRevokes is the maximum number of parallel Vault
	// token revocation requests
	maxParallelRevokes = 64

	// vaultRevocationIntv is the interval at which Vault tokens that failed
	// initial revocation are retried
	vaultRevocationIntv = 5 * time.Minute

	// vaultCapabilitiesLookupPath is the path to lookup the capabilities of
	// one's own token.
	vaultCapabilitiesLookupPath = "sys/capabilities-self"

	// vaultTokenRenewPath is the path used to renew our token
	vaultTokenRenewPath = "auth/token/renew-self"

	// vaultTokenLookupPath is the path used to lookup a token
	vaultTokenLookupPath = "auth/token/lookup"

	// vaultTokenLookupSelfPath is the path used to lookup self token
	vaultTokenLookupSelfPath = "auth/token/lookup-self"

	// vaultTokenRevokePath is the path used to revoke a token by its accessor
	vaultTokenRevokePath = "auth/token/revoke-accessor"

	// vaultRoleLookupPath is the path to lookup a role. It is a format string
	// that takes the role name.
	vaultRoleLookupPath = "auth/token/roles/%s"

	// vaultTokenRoleCreatePath is the path to create a token from a role. It
	// is a format string that takes the role name.
	vaultTokenRoleCreatePath = "auth/token/create/%s"
)
    74  
var (
	// vaultUnrecoverableError matches unrecoverable errors, i.e. error text
	// containing "Code:" followed by 400, 403 or 404.
	vaultUnrecoverableError = regexp.MustCompile(`Code:\s+40(0|3|4)`)

	// vaultCapabilitiesCapability is the expected capability of Nomad's Vault
	// token on the path. The token must have at least one of the
	// capabilities.
	vaultCapabilitiesCapability = []string{"update", "root"}

	// vaultTokenRenewCapability is the expected capability Nomad's
	// Vault token should have on the path. The token must have at least one of
	// the capabilities.
	vaultTokenRenewCapability = []string{"update", "root"}

	// vaultTokenLookupCapability is the expected capability Nomad's
	// Vault token should have on the path. The token must have at least one of
	// the capabilities.
	vaultTokenLookupCapability = []string{"update", "root"}

	// vaultTokenLookupSelfCapability is the expected capability Nomad's
	// Vault token should have on the path. The token must have at least one of
	// the capabilities.
	vaultTokenLookupSelfCapability = []string{"update", "root"}

	// vaultTokenRevokeCapability is the expected capability Nomad's
	// Vault token should have on the path. The token must have at least one of
	// the capabilities.
	vaultTokenRevokeCapability = []string{"update", "root"}

	// vaultRoleLookupCapability is the expected capability Nomad's Vault
	// token should have on the path. The token must have at least one of the
	// capabilities.
	vaultRoleLookupCapability = []string{"read", "root"}

	// vaultTokenRoleCreateCapability is the expected capability Nomad's Vault
	// token should have on the path. The token must have at least one of the
	// capabilities.
	vaultTokenRoleCreateCapability = []string{"update", "root"}
)
   114  
// VaultClient is the Server's interface for interacting with Vault.
type VaultClient interface {
	// SetActive activates or de-activates the Vault client. When active, token
	// creation/lookup/revocation operations are allowed.
	SetActive(active bool)

	// SetConfig updates the config used by the Vault client
	SetConfig(config *config.VaultConfig) error

	// CreateToken takes an allocation and task and returns an appropriate Vault
	// Secret
	CreateToken(ctx context.Context, a *structs.Allocation, task string) (*vapi.Secret, error)

	// LookupToken takes a token string and returns the Vault Secret describing
	// it.
	LookupToken(ctx context.Context, token string) (*vapi.Secret, error)

	// RevokeTokens takes a set of token accessors and revokes the tokens.
	// NOTE(review): the exact meaning of committed is defined by the
	// implementation, which is not shown here - confirm before relying on it.
	RevokeTokens(ctx context.Context, accessors []*structs.VaultAccessor, committed bool) error

	// Stop is used to stop token renewal
	Stop()

	// Running returns whether the Vault client is running
	Running() bool

	// Stats returns the Vault client's statistics
	Stats() *VaultStats

	// EmitStats emits the client's statistics at the given period until
	// signaled to stop via stopCh.
	EmitStats(period time.Duration, stopCh chan struct{})
}
   147  
// VaultStats holds the stats about Vault tokens created and managed by
// Nomad.
type VaultStats struct {
	// TrackedForRevoke is the count of tokens that are being tracked to be
	// revoked since they could not be immediately revoked.
	TrackedForRevoke int
}
   155  
// PurgeVaultAccessorFn is called to remove VaultAccessors from the system. If
// the function returns an error, the token will still be tracked and revocation
// will retry until there is a success.
type PurgeVaultAccessorFn func(accessors []*structs.VaultAccessor) error
   160  
// tokenData holds the relevant information about the Vault token passed to the
// client. All fields except Root are decoded from the data block of the token
// lookup response; Root is computed by the client and is true when the
// token's policies include "root". CreationTTL is treated as a number of
// seconds by the renewal code.
type tokenData struct {
	CreationTTL int      `mapstructure:"creation_ttl"`
	TTL         int      `mapstructure:"ttl"`
	Renewable   bool     `mapstructure:"renewable"`
	Policies    []string `mapstructure:"policies"`
	Role        string   `mapstructure:"role"`
	Root        bool
}
   171  
// vaultClient is the Server's implementation of the VaultClient interface. The
// client renews the PeriodicToken given in the Vault configuration and provides
// the Server with the ability to create child tokens and lookup the permissions
// of tokens.
type vaultClient struct {
	// limiter is used to rate limit requests to Vault
	limiter *rate.Limiter

	// client is the Vault API client
	client *vapi.Client

	// auth is the Vault token auth API client
	auth *vapi.TokenAuth

	// config is the user passed Vault config
	config *config.VaultConfig

	// connEstablished marks whether we have an established connection to Vault.
	connEstablished bool

	// connEstablishedErr marks an error that can occur when establishing a
	// connection
	connEstablishedErr error

	// token is the raw token used by the client
	token string

	// tokenData is the data of the passed Vault token
	tokenData *tokenData

	// revoking tracks the VaultAccessors that must be revoked. purgeFn is the
	// callback supplied at construction for purging accessors, and revLock is
	// the mutex taken when the revoking map is replaced in SetActive.
	revoking map[*structs.VaultAccessor]time.Time
	purgeFn  PurgeVaultAccessorFn
	revLock  sync.Mutex

	// active indicates whether the vaultClient is active. It should be
	// accessed using a helper and updated atomically
	active int32

	// running indicates whether the vault client is started.
	running bool

	// childTTL is the TTL for child tokens.
	childTTL string

	// lastRenewed is the time the token was last renewed
	lastRenewed time.Time

	// tomb tracks the background goroutines so they can be killed and waited
	// on; logger receives the client's log output.
	tomb   *tomb.Tomb
	logger *log.Logger

	// stats stores the stats
	stats     *VaultStats
	statsLock sync.RWMutex

	// l is used to lock the configuration aspects of the client such that
	// multiple callers can't cause conflicting config updates
	l sync.Mutex
}
   231  
   232  // NewVaultClient returns a Vault client from the given config. If the client
   233  // couldn't be made an error is returned.
   234  func NewVaultClient(c *config.VaultConfig, logger *log.Logger, purgeFn PurgeVaultAccessorFn) (*vaultClient, error) {
   235  	if c == nil {
   236  		return nil, fmt.Errorf("must pass valid VaultConfig")
   237  	}
   238  
   239  	if logger == nil {
   240  		return nil, fmt.Errorf("must pass valid logger")
   241  	}
   242  
   243  	v := &vaultClient{
   244  		config:   c,
   245  		logger:   logger,
   246  		limiter:  rate.NewLimiter(requestRateLimit, int(requestRateLimit)),
   247  		revoking: make(map[*structs.VaultAccessor]time.Time),
   248  		purgeFn:  purgeFn,
   249  		tomb:     &tomb.Tomb{},
   250  		stats:    new(VaultStats),
   251  	}
   252  
   253  	if v.config.IsEnabled() {
   254  		if err := v.buildClient(); err != nil {
   255  			return nil, err
   256  		}
   257  
   258  		// Launch the required goroutines
   259  		v.tomb.Go(wrapNilError(v.establishConnection))
   260  		v.tomb.Go(wrapNilError(v.revokeDaemon))
   261  
   262  		v.running = true
   263  	}
   264  
   265  	return v, nil
   266  }
   267  
   268  func (v *vaultClient) Stop() {
   269  	v.l.Lock()
   270  	running := v.running
   271  	v.running = false
   272  	v.l.Unlock()
   273  
   274  	if running {
   275  		v.tomb.Kill(nil)
   276  		v.tomb.Wait()
   277  		v.flush()
   278  	}
   279  }
   280  
   281  func (v *vaultClient) Running() bool {
   282  	v.l.Lock()
   283  	defer v.l.Unlock()
   284  	return v.running
   285  }
   286  
   287  // SetActive activates or de-activates the Vault client. When active, token
   288  // creation/lookup/revocation operation are allowed. All queued revocations are
   289  // cancelled if set un-active as it is assumed another instances is taking over
   290  func (v *vaultClient) SetActive(active bool) {
   291  	if active {
   292  		atomic.StoreInt32(&v.active, 1)
   293  	} else {
   294  		atomic.StoreInt32(&v.active, 0)
   295  	}
   296  
   297  	// Clear out the revoking tokens
   298  	v.revLock.Lock()
   299  	v.revoking = make(map[*structs.VaultAccessor]time.Time)
   300  	v.revLock.Unlock()
   301  
   302  	return
   303  }
   304  
   305  // flush is used to reset the state of the vault client
   306  func (v *vaultClient) flush() {
   307  	v.l.Lock()
   308  	defer v.l.Unlock()
   309  
   310  	v.client = nil
   311  	v.auth = nil
   312  	v.connEstablished = false
   313  	v.connEstablishedErr = nil
   314  	v.token = ""
   315  	v.tokenData = nil
   316  	v.revoking = make(map[*structs.VaultAccessor]time.Time)
   317  	v.childTTL = ""
   318  	v.tomb = &tomb.Tomb{}
   319  }
   320  
// SetConfig is used to update the Vault config being used. A temporary outage
// may occur after calling as it re-establishes a connection to Vault.
//
// A nil config is rejected with an error. If the client is running, its
// background goroutines are killed and waited on and a fresh tomb is
// installed before the new config is stored. If the new config enables the
// integration, the API client is rebuilt and the goroutines are relaunched;
// if rebuilding fails the error is returned and the client is left not
// running.
//
// NOTE(review): the configuration lock is held for the whole call, including
// the tomb.Wait(). establishConnection and renewalLoop also acquire that lock
// before returning, so a goroutine that reaches that point while SetConfig is
// waiting could presumably block forever - confirm and consider releasing
// the lock around the wait.
func (v *vaultClient) SetConfig(config *config.VaultConfig) error {
	if config == nil {
		return fmt.Errorf("must pass valid VaultConfig")
	}

	v.l.Lock()
	defer v.l.Unlock()

	// Kill any background routines
	if v.running {
		// Stop accepting any new request
		v.connEstablished = false

		// Kill any background routine and create a new tomb
		v.tomb.Kill(nil)
		v.tomb.Wait()
		v.tomb = &tomb.Tomb{}
		v.running = false
	}

	// Store the new config
	v.config = config

	// Check if we should relaunch
	if v.config.IsEnabled() {
		// Rebuild the client
		if err := v.buildClient(); err != nil {
			return err
		}

		// Launch the required goroutines
		v.tomb.Go(wrapNilError(v.establishConnection))
		v.tomb.Go(wrapNilError(v.revokeDaemon))
		v.running = true
	}

	return nil
}
   361  
   362  // buildClient is used to build a Vault client based on the stored Vault config
   363  func (v *vaultClient) buildClient() error {
   364  	// Validate we have the required fields.
   365  	if v.config.Token == "" {
   366  		return errors.New("Vault token must be set")
   367  	} else if v.config.Addr == "" {
   368  		return errors.New("Vault address must be set")
   369  	}
   370  
   371  	// Parse the TTL if it is set
   372  	if v.config.TaskTokenTTL != "" {
   373  		d, err := time.ParseDuration(v.config.TaskTokenTTL)
   374  		if err != nil {
   375  			return fmt.Errorf("failed to parse TaskTokenTTL %q: %v", v.config.TaskTokenTTL, err)
   376  		}
   377  
   378  		if d.Nanoseconds() < minimumTokenTTL.Nanoseconds() {
   379  			return fmt.Errorf("ChildTokenTTL is less than minimum allowed of %v", minimumTokenTTL)
   380  		}
   381  
   382  		v.childTTL = v.config.TaskTokenTTL
   383  	} else {
   384  		// Default the TaskTokenTTL
   385  		v.childTTL = defaultTokenTTL
   386  	}
   387  
   388  	// Get the Vault API configuration
   389  	apiConf, err := v.config.ApiConfig()
   390  	if err != nil {
   391  		return fmt.Errorf("Failed to create Vault API config: %v", err)
   392  	}
   393  
   394  	// Create the Vault API client
   395  	client, err := vapi.NewClient(apiConf)
   396  	if err != nil {
   397  		v.logger.Printf("[ERR] vault: failed to create Vault client. Not retrying: %v", err)
   398  		return err
   399  	}
   400  
   401  	// Set the token and store the client
   402  	v.token = v.config.Token
   403  	client.SetToken(v.token)
   404  	v.client = client
   405  	v.auth = client.Auth().Token()
   406  	return nil
   407  }
   408  
   409  // establishConnection is used to make first contact with Vault. This should be
   410  // called in a go-routine since the connection is retried til the Vault Client
   411  // is stopped or the connection is successfully made at which point the renew
   412  // loop is started.
   413  func (v *vaultClient) establishConnection() {
   414  	// Create the retry timer and set initial duration to zero so it fires
   415  	// immediately
   416  	retryTimer := time.NewTimer(0)
   417  
   418  OUTER:
   419  	for {
   420  		select {
   421  		case <-v.tomb.Dying():
   422  			return
   423  		case <-retryTimer.C:
   424  			// Ensure the API is reachable
   425  			if _, err := v.client.Sys().InitStatus(); err != nil {
   426  				v.logger.Printf("[WARN] vault: failed to contact Vault API. Retrying in %v: %v",
   427  					v.config.ConnectionRetryIntv, err)
   428  				retryTimer.Reset(v.config.ConnectionRetryIntv)
   429  				continue OUTER
   430  			}
   431  
   432  			break OUTER
   433  		}
   434  	}
   435  
   436  	// Retrieve our token, validate it and parse the lease duration
   437  	if err := v.parseSelfToken(); err != nil {
   438  		v.logger.Printf("[ERR] vault: failed to validate self token/role and not retrying: %v", err)
   439  		v.l.Lock()
   440  		v.connEstablished = false
   441  		v.connEstablishedErr = err
   442  		v.l.Unlock()
   443  		return
   444  	}
   445  
   446  	// Set the wrapping function such that token creation is wrapped now
   447  	// that we know our role
   448  	v.client.SetWrappingLookupFunc(v.getWrappingFn())
   449  
   450  	// If we are given a non-root token, start renewing it
   451  	if v.tokenData.Root && v.tokenData.CreationTTL == 0 {
   452  		v.logger.Printf("[DEBUG] vault: not renewing token as it is root")
   453  	} else {
   454  		v.logger.Printf("[DEBUG] vault: token lease duration is %v",
   455  			time.Duration(v.tokenData.CreationTTL)*time.Second)
   456  		v.tomb.Go(wrapNilError(v.renewalLoop))
   457  	}
   458  
   459  	v.l.Lock()
   460  	v.connEstablished = true
   461  	v.connEstablishedErr = nil
   462  	v.l.Unlock()
   463  }
   464  
   465  // renewalLoop runs the renew loop. This should only be called if we are given a
   466  // non-root token.
   467  func (v *vaultClient) renewalLoop() {
   468  	// Create the renewal timer and set initial duration to zero so it fires
   469  	// immediately
   470  	authRenewTimer := time.NewTimer(0)
   471  
   472  	// Backoff is to reduce the rate we try to renew with Vault under error
   473  	// situations
   474  	backoff := 0.0
   475  
   476  	for {
   477  		select {
   478  		case <-v.tomb.Dying():
   479  			return
   480  		case <-authRenewTimer.C:
   481  			// Renew the token and determine the new expiration
   482  			err := v.renew()
   483  			currentExpiration := v.lastRenewed.Add(time.Duration(v.tokenData.CreationTTL) * time.Second)
   484  
   485  			// Successfully renewed
   486  			if err == nil {
   487  				// If we take the expiration (lastRenewed + auth duration) and
   488  				// subtract the current time, we get a duration until expiry.
   489  				// Set the timer to poke us after half of that time is up.
   490  				durationUntilRenew := currentExpiration.Sub(time.Now()) / 2
   491  
   492  				v.logger.Printf("[INFO] vault: renewing token in %v", durationUntilRenew)
   493  				authRenewTimer.Reset(durationUntilRenew)
   494  
   495  				// Reset any backoff
   496  				backoff = 0
   497  				break
   498  			}
   499  
   500  			// Back off, increasing the amount of backoff each time. There are some rules:
   501  			//
   502  			// * If we have an existing authentication that is going to expire,
   503  			// never back off more than half of the amount of time remaining
   504  			// until expiration
   505  			// * Never back off more than 30 seconds multiplied by a random
   506  			// value between 1 and 2
   507  			// * Use randomness so that many clients won't keep hitting Vault
   508  			// at the same time
   509  
   510  			// Set base values and add some backoff
   511  
   512  			v.logger.Printf("[WARN] vault: got error or bad auth, so backing off: %v", err)
   513  			switch {
   514  			case backoff < 5:
   515  				backoff = 5
   516  			case backoff >= 24:
   517  				backoff = 30
   518  			default:
   519  				backoff = backoff * 1.25
   520  			}
   521  
   522  			// Add randomness
   523  			backoff = backoff * (1.0 + rand.Float64())
   524  
   525  			maxBackoff := currentExpiration.Sub(time.Now()) / 2
   526  			if maxBackoff < 0 {
   527  				// We have failed to renew the token past its expiration. Stop
   528  				// renewing with Vault.
   529  				v.logger.Printf("[ERR] vault: failed to renew Vault token before lease expiration. Shutting down Vault client")
   530  				v.l.Lock()
   531  				v.connEstablished = false
   532  				v.connEstablishedErr = err
   533  				v.l.Unlock()
   534  				return
   535  
   536  			} else if backoff > maxBackoff.Seconds() {
   537  				backoff = maxBackoff.Seconds()
   538  			}
   539  
   540  			durationUntilRetry := time.Duration(backoff) * time.Second
   541  			v.logger.Printf("[INFO] vault: backing off for %v", durationUntilRetry)
   542  
   543  			authRenewTimer.Reset(durationUntilRetry)
   544  		}
   545  	}
   546  }
   547  
   548  // renew attempts to renew our Vault token. If the renewal fails, an error is
   549  // returned. This method updates the lastRenewed time
   550  func (v *vaultClient) renew() error {
   551  	// Attempt to renew the token
   552  	secret, err := v.auth.RenewSelf(v.tokenData.CreationTTL)
   553  	if err != nil {
   554  		return err
   555  	}
   556  
   557  	auth := secret.Auth
   558  	if auth == nil {
   559  		return fmt.Errorf("renewal successful but not auth information returned")
   560  	} else if auth.LeaseDuration == 0 {
   561  		return fmt.Errorf("renewal successful but no lease duration returned")
   562  	}
   563  
   564  	v.lastRenewed = time.Now()
   565  	v.logger.Printf("[DEBUG] vault: succesfully renewed server token")
   566  	return nil
   567  }
   568  
   569  // getWrappingFn returns an appropriate wrapping function for Nomad Servers
   570  func (v *vaultClient) getWrappingFn() func(operation, path string) string {
   571  	createPath := "auth/token/create"
   572  	role := v.getRole()
   573  	if role != "" {
   574  		createPath = fmt.Sprintf("auth/token/create/%s", role)
   575  	}
   576  
   577  	return func(operation, path string) string {
   578  		// Only wrap the token create operation
   579  		if operation != "POST" || path != createPath {
   580  			return ""
   581  		}
   582  
   583  		return vaultTokenCreateTTL
   584  	}
   585  }
   586  
   587  // parseSelfToken looks up the Vault token in Vault and parses its data storing
   588  // it in the client. If the token is not valid for Nomads purposes an error is
   589  // returned.
   590  func (v *vaultClient) parseSelfToken() error {
   591  	// Get the initial lease duration
   592  	auth := v.client.Auth().Token()
   593  	var self *vapi.Secret
   594  
   595  	// Try looking up the token using the self endpoint
   596  	secret, err := auth.LookupSelf()
   597  	if err != nil {
   598  		// Try looking up our token directly
   599  		self, err = auth.Lookup(v.client.Token())
   600  		if err != nil {
   601  			return fmt.Errorf("failed to lookup Vault periodic token: %v", err)
   602  		}
   603  	}
   604  	self = secret
   605  
   606  	// Read and parse the fields
   607  	var data tokenData
   608  	if err := mapstructure.WeakDecode(self.Data, &data); err != nil {
   609  		return fmt.Errorf("failed to parse Vault token's data block: %v", err)
   610  	}
   611  
   612  	root := false
   613  	for _, p := range data.Policies {
   614  		if p == "root" {
   615  			root = true
   616  			break
   617  		}
   618  	}
   619  
   620  	// Store the token data
   621  	data.Root = root
   622  	v.tokenData = &data
   623  
   624  	// The criteria that must be met for the token to be valid are as follows:
   625  	// 1) If token is non-root or is but has a creation ttl
   626  	//   a) The token must be renewable
   627  	//   b) Token must have a non-zero TTL
   628  	// 2) Must have update capability for "auth/token/lookup/" (used to verify incoming tokens)
   629  	// 3) Must have update capability for "/auth/token/revoke-accessor/" (used to revoke unneeded tokens)
   630  	// 4) If configured to create tokens against a role:
   631  	//   a) Must have read capability for "auth/token/roles/<role_name" (Can just attemp a read)
   632  	//   b) Must have update capability for path "auth/token/create/<role_name>"
   633  	//   c) Role must:
   634  	//     1) Not allow orphans
   635  	//     2) Must allow tokens to be renewed
   636  	//     3) Must not have an explicit max TTL
   637  	//     4) Must have non-zero period
   638  	// 5) If not configured against a role, the token must be root
   639  
   640  	var mErr multierror.Error
   641  	role := v.getRole()
   642  	if !root {
   643  		// All non-root tokens must be renewable
   644  		if !data.Renewable {
   645  			multierror.Append(&mErr, fmt.Errorf("Vault token is not renewable or root"))
   646  		}
   647  
   648  		// All non-root tokens must have a lease duration
   649  		if data.CreationTTL == 0 {
   650  			multierror.Append(&mErr, fmt.Errorf("invalid lease duration of zero"))
   651  		}
   652  
   653  		// The lease duration can not be expired
   654  		if data.TTL == 0 {
   655  			multierror.Append(&mErr, fmt.Errorf("token TTL is zero"))
   656  		}
   657  
   658  		// There must be a valid role since we aren't root
   659  		if role == "" {
   660  			multierror.Append(&mErr, fmt.Errorf("token role name must be set when not using a root token"))
   661  		}
   662  
   663  	} else if data.CreationTTL != 0 {
   664  		// If the root token has a TTL it must be renewable
   665  		if !data.Renewable {
   666  			multierror.Append(&mErr, fmt.Errorf("Vault token has a TTL but is not renewable"))
   667  		} else if data.TTL == 0 {
   668  			// If the token has a TTL make sure it has not expired
   669  			multierror.Append(&mErr, fmt.Errorf("token TTL is zero"))
   670  		}
   671  	}
   672  
   673  	// Check we have the correct capabilities
   674  	if err := v.validateCapabilities(role, root); err != nil {
   675  		multierror.Append(&mErr, err)
   676  	}
   677  
   678  	// If given a role validate it
   679  	if role != "" {
   680  		if err := v.validateRole(role); err != nil {
   681  			multierror.Append(&mErr, err)
   682  		}
   683  	}
   684  
   685  	return mErr.ErrorOrNil()
   686  }
   687  
   688  // getRole returns the role name to be used when creating tokens
   689  func (v *vaultClient) getRole() string {
   690  	if v.config.Role != "" {
   691  		return v.config.Role
   692  	}
   693  
   694  	return v.tokenData.Role
   695  }
   696  
   697  // validateCapabilities checks that Nomad's Vault token has the correct
   698  // capabilities.
   699  func (v *vaultClient) validateCapabilities(role string, root bool) error {
   700  	// Check if the token can lookup capabilities.
   701  	var mErr multierror.Error
   702  	_, _, err := v.hasCapability(vaultCapabilitiesLookupPath, vaultCapabilitiesCapability)
   703  	if err != nil {
   704  		// Check if there is a permission denied
   705  		if vaultUnrecoverableError.MatchString(err.Error()) {
   706  			// Since we can't read permissions, we just log a warning that we
   707  			// can't tell if the Vault token will work
   708  			msg := fmt.Sprintf("Can not lookup token capabilities. "+
   709  				"As such certain operations may fail in the future. "+
   710  				"Please give Nomad a Vault token with one of the following "+
   711  				"capabilities %q on %q so that the required capabilities can be verified",
   712  				vaultCapabilitiesCapability, vaultCapabilitiesLookupPath)
   713  			v.logger.Printf("[WARN] vault: %s", msg)
   714  			return nil
   715  		} else {
   716  			multierror.Append(&mErr, err)
   717  		}
   718  	}
   719  
   720  	// verify is a helper function that verifies the token has one of the
   721  	// capabilities on the given path and adds an issue to the error
   722  	verify := func(path string, requiredCaps []string) {
   723  		ok, caps, err := v.hasCapability(path, requiredCaps)
   724  		if err != nil {
   725  			multierror.Append(&mErr, err)
   726  		} else if !ok {
   727  			multierror.Append(&mErr,
   728  				fmt.Errorf("token must have one of the following capabilities %q on %q; has %v", requiredCaps, path, caps))
   729  		}
   730  	}
   731  
   732  	// Check if we are verifying incoming tokens
   733  	if !v.config.AllowsUnauthenticated() {
   734  		verify(vaultTokenLookupPath, vaultTokenLookupCapability)
   735  	}
   736  
   737  	// Verify we can renew our selves tokens
   738  	verify(vaultTokenRenewPath, vaultTokenRenewCapability)
   739  
   740  	// Verify we can revoke tokens
   741  	verify(vaultTokenRevokePath, vaultTokenRevokeCapability)
   742  
   743  	// If we are using a role verify the capability
   744  	if role != "" {
   745  		// Verify we can read the role
   746  		verify(fmt.Sprintf(vaultRoleLookupPath, role), vaultRoleLookupCapability)
   747  
   748  		// Verify we can create from the role
   749  		verify(fmt.Sprintf(vaultTokenRoleCreatePath, role), vaultTokenRoleCreateCapability)
   750  	}
   751  
   752  	return mErr.ErrorOrNil()
   753  }
   754  
   755  // hasCapability takes a path and returns whether the token has at least one of
   756  // the required capabilities on the given path. It also returns the set of
   757  // capabilities the token does have as well as any error that occured.
   758  func (v *vaultClient) hasCapability(path string, required []string) (bool, []string, error) {
   759  	caps, err := v.client.Sys().CapabilitiesSelf(path)
   760  	if err != nil {
   761  		return false, nil, err
   762  	}
   763  	for _, c := range caps {
   764  		for _, r := range required {
   765  			if c == r {
   766  				return true, caps, nil
   767  			}
   768  		}
   769  	}
   770  	return false, caps, nil
   771  }
   772  
   773  // validateRole contacts Vault and checks that the given Vault role is valid for
   774  // the purposes of being used by Nomad
   775  func (v *vaultClient) validateRole(role string) error {
   776  	if role == "" {
   777  		return fmt.Errorf("Invalid empty role name")
   778  	}
   779  
   780  	// Validate the role
   781  	rsecret, err := v.client.Logical().Read(fmt.Sprintf("auth/token/roles/%s", role))
   782  	if err != nil {
   783  		return fmt.Errorf("failed to lookup role %q: %v", role, err)
   784  	}
   785  
   786  	// Read and parse the fields
   787  	var data struct {
   788  		ExplicitMaxTtl int `mapstructure:"explicit_max_ttl"`
   789  		Orphan         bool
   790  		Period         int
   791  		Renewable      bool
   792  	}
   793  	if err := mapstructure.WeakDecode(rsecret.Data, &data); err != nil {
   794  		return fmt.Errorf("failed to parse Vault role's data block: %v", err)
   795  	}
   796  
   797  	// Validate the role is acceptable
   798  	var mErr multierror.Error
   799  	if data.Orphan {
   800  		multierror.Append(&mErr, fmt.Errorf("Role must not allow orphans"))
   801  	}
   802  
   803  	if !data.Renewable {
   804  		multierror.Append(&mErr, fmt.Errorf("Role must allow tokens to be renewed"))
   805  	}
   806  
   807  	if data.ExplicitMaxTtl != 0 {
   808  		multierror.Append(&mErr, fmt.Errorf("Role can not use an explicit max ttl. Token must be periodic."))
   809  	}
   810  
   811  	if data.Period == 0 {
   812  		multierror.Append(&mErr, fmt.Errorf("Role must have a non-zero period to make tokens periodic."))
   813  	}
   814  
   815  	return mErr.ErrorOrNil()
   816  }
   817  
   818  // ConnectionEstablished returns whether a connection to Vault has been
   819  // established and any error that potentially caused it to be false
   820  func (v *vaultClient) ConnectionEstablished() (bool, error) {
   821  	v.l.Lock()
   822  	defer v.l.Unlock()
   823  	return v.connEstablished, v.connEstablishedErr
   824  }
   825  
   826  // Enabled returns whether the client is active
   827  func (v *vaultClient) Enabled() bool {
   828  	v.l.Lock()
   829  	defer v.l.Unlock()
   830  	return v.config.IsEnabled()
   831  }
   832  
   833  // Active returns whether the client is active
   834  func (v *vaultClient) Active() bool {
   835  	return atomic.LoadInt32(&v.active) == 1
   836  }
   837  
   838  // CreateToken takes the allocation and task and returns an appropriate Vault
   839  // token. The call is rate limited and may be canceled with the passed policy.
   840  // When the error is recoverable, it will be of type RecoverableError
   841  func (v *vaultClient) CreateToken(ctx context.Context, a *structs.Allocation, task string) (*vapi.Secret, error) {
   842  	if !v.Enabled() {
   843  		return nil, fmt.Errorf("Vault integration disabled")
   844  	}
   845  	if !v.Active() {
   846  		return nil, structs.NewRecoverableError(fmt.Errorf("Vault client not active"), true)
   847  	}
   848  
   849  	// Check if we have established a connection with Vault
   850  	if established, err := v.ConnectionEstablished(); !established && err == nil {
   851  		return nil, structs.NewRecoverableError(fmt.Errorf("Connection to Vault has not been established"), true)
   852  	} else if !established {
   853  		return nil, fmt.Errorf("Connection to Vault failed: %v", err)
   854  	}
   855  
   856  	// Track how long the request takes
   857  	defer metrics.MeasureSince([]string{"nomad", "vault", "create_token"}, time.Now())
   858  
   859  	// Retrieve the Vault block for the task
   860  	policies := a.Job.VaultPolicies()
   861  	if policies == nil {
   862  		return nil, fmt.Errorf("Job doesn't require Vault policies")
   863  	}
   864  	tg, ok := policies[a.TaskGroup]
   865  	if !ok {
   866  		return nil, fmt.Errorf("Task group does not require Vault policies")
   867  	}
   868  	taskVault, ok := tg[task]
   869  	if !ok {
   870  		return nil, fmt.Errorf("Task does not require Vault policies")
   871  	}
   872  
   873  	// Build the creation request
   874  	req := &vapi.TokenCreateRequest{
   875  		Policies: taskVault.Policies,
   876  		Metadata: map[string]string{
   877  			"AllocationID": a.ID,
   878  			"Task":         task,
   879  			"NodeID":       a.NodeID,
   880  		},
   881  		TTL:         v.childTTL,
   882  		DisplayName: fmt.Sprintf("%s-%s", a.ID, task),
   883  	}
   884  
   885  	// Ensure we are under our rate limit
   886  	if err := v.limiter.Wait(ctx); err != nil {
   887  		return nil, err
   888  	}
   889  
   890  	// Make the request and switch depending on whether we are using a root
   891  	// token or a role based token
   892  	var secret *vapi.Secret
   893  	var err error
   894  	role := v.getRole()
   895  	if v.tokenData.Root && role == "" {
   896  		req.Period = v.childTTL
   897  		secret, err = v.auth.Create(req)
   898  	} else {
   899  		// Make the token using the role
   900  		secret, err = v.auth.CreateWithRole(req, v.getRole())
   901  	}
   902  
   903  	// Determine whether it is unrecoverable
   904  	if err != nil {
   905  		if vaultUnrecoverableError.MatchString(err.Error()) {
   906  			return secret, err
   907  		}
   908  
   909  		// The error is recoverable
   910  		return nil, structs.NewRecoverableError(err, true)
   911  	}
   912  
   913  	return secret, nil
   914  }
   915  
   916  // LookupToken takes a Vault token and does a lookup against Vault. The call is
   917  // rate limited and may be canceled with passed context.
   918  func (v *vaultClient) LookupToken(ctx context.Context, token string) (*vapi.Secret, error) {
   919  	if !v.Enabled() {
   920  		return nil, fmt.Errorf("Vault integration disabled")
   921  	}
   922  
   923  	if !v.Active() {
   924  		return nil, fmt.Errorf("Vault client not active")
   925  	}
   926  
   927  	// Check if we have established a connection with Vault
   928  	if established, err := v.ConnectionEstablished(); !established && err == nil {
   929  		return nil, structs.NewRecoverableError(fmt.Errorf("Connection to Vault has not been established"), true)
   930  	} else if !established {
   931  		return nil, fmt.Errorf("Connection to Vault failed: %v", err)
   932  	}
   933  
   934  	// Track how long the request takes
   935  	defer metrics.MeasureSince([]string{"nomad", "vault", "lookup_token"}, time.Now())
   936  
   937  	// Ensure we are under our rate limit
   938  	if err := v.limiter.Wait(ctx); err != nil {
   939  		return nil, err
   940  	}
   941  
   942  	// Lookup the token
   943  	return v.auth.Lookup(token)
   944  }
   945  
   946  // PoliciesFrom parses the set of policies returned by a token lookup.
   947  func PoliciesFrom(s *vapi.Secret) ([]string, error) {
   948  	if s == nil {
   949  		return nil, fmt.Errorf("cannot parse nil Vault secret")
   950  	}
   951  	var data tokenData
   952  	if err := mapstructure.WeakDecode(s.Data, &data); err != nil {
   953  		return nil, fmt.Errorf("failed to parse Vault token's data block: %v", err)
   954  	}
   955  
   956  	return data.Policies, nil
   957  }
   958  
   959  // RevokeTokens revokes the passed set of accessors. If committed is set, the
   960  // purge function passed to the client is called. If there is an error purging
   961  // either because of Vault failures or because of the purge function, the
   962  // revocation is retried until the tokens TTL.
   963  func (v *vaultClient) RevokeTokens(ctx context.Context, accessors []*structs.VaultAccessor, committed bool) error {
   964  	if !v.Enabled() {
   965  		return nil
   966  	}
   967  
   968  	if !v.Active() {
   969  		return fmt.Errorf("Vault client not active")
   970  	}
   971  
   972  	// Track how long the request takes
   973  	defer metrics.MeasureSince([]string{"nomad", "vault", "revoke_tokens"}, time.Now())
   974  
   975  	// Check if we have established a connection with Vault. If not just add it
   976  	// to the queue
   977  	if established, err := v.ConnectionEstablished(); !established && err == nil {
   978  		// Only bother tracking it for later revocation if the accessor was
   979  		// committed
   980  		if committed {
   981  			v.storeForRevocation(accessors)
   982  		}
   983  
   984  		// Track that we are abandoning these accessors.
   985  		metrics.IncrCounter([]string{"nomad", "vault", "undistributed_tokens_abandoned"}, float32(len(accessors)))
   986  		return nil
   987  	}
   988  
   989  	// Attempt to revoke immediately and if it fails, add it to the revoke queue
   990  	err := v.parallelRevoke(ctx, accessors)
   991  	if err != nil {
   992  		// If it is uncommitted, it is a best effort revoke as it will shortly
   993  		// TTL within the cubbyhole and has not been leaked to any outside
   994  		// system
   995  		if !committed {
   996  			metrics.IncrCounter([]string{"nomad", "vault", "undistributed_tokens_abandoned"}, float32(len(accessors)))
   997  			return nil
   998  		}
   999  
  1000  		v.logger.Printf("[WARN] vault: failed to revoke tokens. Will reattempt til TTL: %v", err)
  1001  		v.storeForRevocation(accessors)
  1002  		return nil
  1003  	} else if !committed {
  1004  		// Mark that it was revoked but there is nothing to purge so exit
  1005  		metrics.IncrCounter([]string{"nomad", "vault", "undistributed_tokens_revoked"}, float32(len(accessors)))
  1006  		return nil
  1007  	}
  1008  
  1009  	if err := v.purgeFn(accessors); err != nil {
  1010  		v.logger.Printf("[ERR] vault: failed to purge Vault accessors: %v", err)
  1011  		v.storeForRevocation(accessors)
  1012  		return nil
  1013  	}
  1014  
  1015  	// Track that it was revoked successfully
  1016  	metrics.IncrCounter([]string{"nomad", "vault", "distributed_tokens_revoked"}, float32(len(accessors)))
  1017  
  1018  	return nil
  1019  }
  1020  
  1021  // storeForRevocation stores the passed set of accessors for revocation. It
  1022  // captrues their effective TTL by storing their create TTL plus the current
  1023  // time.
  1024  func (v *vaultClient) storeForRevocation(accessors []*structs.VaultAccessor) {
  1025  	v.revLock.Lock()
  1026  	v.statsLock.Lock()
  1027  	now := time.Now()
  1028  	for _, a := range accessors {
  1029  		v.revoking[a] = now.Add(time.Duration(a.CreationTTL) * time.Second)
  1030  	}
  1031  	v.stats.TrackedForRevoke = len(v.revoking)
  1032  	v.statsLock.Unlock()
  1033  	v.revLock.Unlock()
  1034  }
  1035  
  1036  // parallelRevoke revokes the passed VaultAccessors in parallel.
  1037  func (v *vaultClient) parallelRevoke(ctx context.Context, accessors []*structs.VaultAccessor) error {
  1038  	if !v.Enabled() {
  1039  		return fmt.Errorf("Vault integration disabled")
  1040  	}
  1041  
  1042  	if !v.Active() {
  1043  		return fmt.Errorf("Vault client not active")
  1044  	}
  1045  
  1046  	// Check if we have established a connection with Vault
  1047  	if established, err := v.ConnectionEstablished(); !established && err == nil {
  1048  		return structs.NewRecoverableError(fmt.Errorf("Connection to Vault has not been established"), true)
  1049  	} else if !established {
  1050  		return fmt.Errorf("Connection to Vault failed: %v", err)
  1051  	}
  1052  
  1053  	g, pCtx := errgroup.WithContext(ctx)
  1054  
  1055  	// Cap the handlers
  1056  	handlers := len(accessors)
  1057  	if handlers > maxParallelRevokes {
  1058  		handlers = maxParallelRevokes
  1059  	}
  1060  
  1061  	// Create the Vault Tokens
  1062  	input := make(chan *structs.VaultAccessor, handlers)
  1063  	for i := 0; i < handlers; i++ {
  1064  		g.Go(func() error {
  1065  			for {
  1066  				select {
  1067  				case va, ok := <-input:
  1068  					if !ok {
  1069  						return nil
  1070  					}
  1071  
  1072  					if err := v.auth.RevokeAccessor(va.Accessor); err != nil {
  1073  						return fmt.Errorf("failed to revoke token (alloc: %q, node: %q, task: %q): %v", va.AllocID, va.NodeID, va.Task, err)
  1074  					}
  1075  				case <-pCtx.Done():
  1076  					return nil
  1077  				}
  1078  			}
  1079  		})
  1080  	}
  1081  
  1082  	// Send the input
  1083  	go func() {
  1084  		defer close(input)
  1085  		for _, va := range accessors {
  1086  			select {
  1087  			case <-pCtx.Done():
  1088  				return
  1089  			case input <- va:
  1090  			}
  1091  		}
  1092  
  1093  	}()
  1094  
  1095  	// Wait for everything to complete
  1096  	return g.Wait()
  1097  }
  1098  
  1099  // revokeDaemon should be called in a goroutine and is used to periodically
  1100  // revoke Vault accessors that failed the original revocation
  1101  func (v *vaultClient) revokeDaemon() {
  1102  	ticker := time.NewTicker(vaultRevocationIntv)
  1103  	defer ticker.Stop()
  1104  
  1105  	for {
  1106  		select {
  1107  		case <-v.tomb.Dying():
  1108  			return
  1109  		case now := <-ticker.C:
  1110  			if established, _ := v.ConnectionEstablished(); !established {
  1111  				continue
  1112  			}
  1113  
  1114  			v.revLock.Lock()
  1115  
  1116  			// Fast path
  1117  			if len(v.revoking) == 0 {
  1118  				v.revLock.Unlock()
  1119  				continue
  1120  			}
  1121  
  1122  			// Build the list of allocations that need to revoked while pruning any TTL'd checks
  1123  			revoking := make([]*structs.VaultAccessor, 0, len(v.revoking))
  1124  			for va, ttl := range v.revoking {
  1125  				if now.After(ttl) {
  1126  					delete(v.revoking, va)
  1127  				} else {
  1128  					revoking = append(revoking, va)
  1129  				}
  1130  			}
  1131  
  1132  			if err := v.parallelRevoke(context.Background(), revoking); err != nil {
  1133  				v.logger.Printf("[WARN] vault: background token revocation errored: %v", err)
  1134  				v.revLock.Unlock()
  1135  				continue
  1136  			}
  1137  
  1138  			// Unlock before a potentially expensive operation
  1139  			v.revLock.Unlock()
  1140  
  1141  			// Call the passed in token revocation function
  1142  			if err := v.purgeFn(revoking); err != nil {
  1143  				// Can continue since revocation is idempotent
  1144  				v.logger.Printf("[ERR] vault: token revocation errored: %v", err)
  1145  				continue
  1146  			}
  1147  
  1148  			// Track that tokens were revoked successfully
  1149  			metrics.IncrCounter([]string{"nomad", "vault", "distributed_tokens_revoked"}, float32(len(revoking)))
  1150  
  1151  			// Can delete from the tracked list now that we have purged
  1152  			v.revLock.Lock()
  1153  			v.statsLock.Lock()
  1154  			for _, va := range revoking {
  1155  				delete(v.revoking, va)
  1156  			}
  1157  			v.stats.TrackedForRevoke = len(v.revoking)
  1158  			v.statsLock.Unlock()
  1159  			v.revLock.Unlock()
  1160  
  1161  		}
  1162  	}
  1163  }
  1164  
  1165  // purgeVaultAccessors creates a Raft transaction to remove the passed Vault
  1166  // Accessors
  1167  func (s *Server) purgeVaultAccessors(accessors []*structs.VaultAccessor) error {
  1168  	// Commit this update via Raft
  1169  	req := structs.VaultAccessorsRequest{Accessors: accessors}
  1170  	_, _, err := s.raftApply(structs.VaultAccessorDegisterRequestType, req)
  1171  	return err
  1172  }
  1173  
  1174  // wrapNilError is a helper that returns a wrapped function that returns a nil
  1175  // error
  1176  func wrapNilError(f func()) func() error {
  1177  	return func() error {
  1178  		f()
  1179  		return nil
  1180  	}
  1181  }
  1182  
  1183  // setLimit is used to update the rate limit
  1184  func (v *vaultClient) setLimit(l rate.Limit) {
  1185  	v.l.Lock()
  1186  	defer v.l.Unlock()
  1187  	v.limiter = rate.NewLimiter(l, int(l))
  1188  }
  1189  
  1190  // Stats is used to query the state of the blocked eval tracker.
  1191  func (v *vaultClient) Stats() *VaultStats {
  1192  	// Allocate a new stats struct
  1193  	stats := new(VaultStats)
  1194  
  1195  	v.statsLock.RLock()
  1196  	defer v.statsLock.RUnlock()
  1197  
  1198  	// Copy all the stats
  1199  	stats.TrackedForRevoke = v.stats.TrackedForRevoke
  1200  
  1201  	return stats
  1202  }
  1203  
  1204  // EmitStats is used to export metrics about the blocked eval tracker while enabled
  1205  func (v *vaultClient) EmitStats(period time.Duration, stopCh chan struct{}) {
  1206  	for {
  1207  		select {
  1208  		case <-time.After(period):
  1209  			stats := v.Stats()
  1210  			metrics.SetGauge([]string{"nomad", "vault", "distributed_tokens_revoking"}, float32(stats.TrackedForRevoke))
  1211  		case <-stopCh:
  1212  			return
  1213  		}
  1214  	}
  1215  }