gopkg.in/hashicorp/nomad.v0@v0.11.8/nomad/consul.go (about)

     1  package nomad
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"strings"
     7  	"sync"
     8  	"time"
     9  
    10  	"github.com/armon/go-metrics"
    11  	"github.com/hashicorp/consul/api"
    12  	"github.com/hashicorp/go-hclog"
    13  	"github.com/hashicorp/nomad/command/agent/consul"
    14  	"github.com/hashicorp/nomad/nomad/structs"
    15  	"github.com/pkg/errors"
    16  	"golang.org/x/sync/errgroup"
    17  	"golang.org/x/time/rate"
    18  )
    19  
const (
	// siTokenDescriptionFmt is the format for the .Description field of
	// service identity tokens generated on behalf of Nomad. Its three verbs
	// are filled with the cluster ID, the allocation ID, and the task name,
	// in that order.
	siTokenDescriptionFmt = "_nomad_si [%s] [%s] [%s]"

	// siTokenRequestRateLimit is the maximum number of requests per second Nomad
	// will make against Consul for SI token work. The limiter built from it is
	// shared by operator token checks, token creation, and token revocation,
	// and the same value is also used as the limiter's burst size.
	siTokenRequestRateLimit rate.Limit = 500

	// siTokenMaxParallelRevokes is the maximum number of parallel SI token
	// revocation requests Nomad will make against Consul.
	siTokenMaxParallelRevokes = 64

	// siTokenRevocationInterval is the interval at which SI tokens that failed
	// initial revocation are retried by the background revocation goroutine.
	siTokenRevocationInterval = 5 * time.Minute
)
    37  
const (
	// ConsulPolicyWrite is the literal text of the policy field of a Consul Policy
	// Rule that we check for when validating that an Operator Consul token has
	// the permissions necessary to create a Service Identity token for a given
	// service.
	//
	// The rule may be:
	//  - service.<exact>
	//  - service."*" (wildcard)
	//  - service_prefix.<matching> (including empty string)
	//
	// e.g.
	//   service "web" { policy = "write" }
	//   service_prefix "" { policy = "write" }
	ConsulPolicyWrite = "write"
)
    54  
// ServiceIdentityRequest carries the metadata used to create a Service
// Identity (SI) token in Consul on behalf of a task. Every field must be
// non-empty for the request to pass Validate.
type ServiceIdentityRequest struct {
	// TaskKind is the kind of the task; its Value() is used as the Consul
	// service name the SI token is created for.
	TaskKind structs.TaskKind
	// TaskName is the name of the task the token is created for.
	TaskName string
	// ClusterID identifies the Nomad cluster; it is embedded in the token
	// description.
	ClusterID string
	// AllocID is the allocation the task belongs to; it is embedded in the
	// token description.
	AllocID string
}
    61  
    62  func (sir ServiceIdentityRequest) Validate() error {
    63  	switch {
    64  	case sir.ClusterID == "":
    65  		return errors.New("cluster id not set")
    66  	case sir.AllocID == "":
    67  		return errors.New("alloc id not set")
    68  	case sir.TaskName == "":
    69  		return errors.New("task name not set")
    70  	case sir.TaskKind == "":
    71  		return errors.New("task kind not set")
    72  	default:
    73  		return nil
    74  	}
    75  }
    76  
// Description returns the text used for the .Description field of the SI
// token: siTokenDescriptionFmt filled in with the request's cluster ID,
// allocation ID, and task name.
func (sir ServiceIdentityRequest) Description() string {
	return fmt.Sprintf(siTokenDescriptionFmt, sir.ClusterID, sir.AllocID, sir.TaskName)
}
    80  
// ConsulACLsAPI is an abstraction over the consul/api.ACL API used by Nomad
// Server.
//
// ACL requirements
// - acl:write (transitive through ACLsAPI)
type ConsulACLsAPI interface {

	// CheckSIPolicy checks that the given operator token has the equivalent ACL
	// permissiveness that a Service Identity token policy for task would have.
	// A nil error means the token is sufficient.
	CheckSIPolicy(ctx context.Context, task, secretID string) error

	// CreateToken instructs Consul to create a Service Identity token for the
	// given request.
	CreateToken(context.Context, ServiceIdentityRequest) (*structs.SIToken, error)

	// RevokeTokens instructs Consul to revoke the given token accessors. In
	// the implementation in this file the bool argument states whether the
	// accessors were committed to raft (and so must also be purged), and the
	// returned bool is true when one or more accessors were queued for a
	// background revocation retry.
	RevokeTokens(context.Context, []*structs.SITokenAccessor, bool) bool

	// MarkForRevocation marks the tokens for background revocation without
	// attempting an immediate revocation.
	MarkForRevocation([]*structs.SITokenAccessor)

	// Stop is used to stop background token revocations. Intended to be used
	// on Nomad Server shutdown.
	Stop()

	// todo(shoenig): use list endpoint for finding orphaned tokens
	// ListTokens lists every token in Consul.
	// ListTokens() ([]string, error)
}
   109  
// PurgeSITokenAccessorFunc is called to remove SI Token accessors from the
// system (i.e. raft) once their tokens have been revoked in Consul. If the
// function returns an error, the accessors remain tracked and revocation
// attempts are retried in the background until one succeeds.
type PurgeSITokenAccessorFunc func([]*structs.SITokenAccessor) error
   115  
// SITokenStats holds statistics about SI token management.
type SITokenStats struct {
	// TrackedForRevoke is presumably the number of accessors currently queued
	// for background revocation.
	// NOTE(review): nothing in this file populates it; confirm against the
	// code that does.
	TrackedForRevoke int
}
   119  
// consulACLsAPI is the ConsulACLsAPI implementation used by Nomad Server. It
// rate limits its requests to Consul and keeps a list of SI token accessors
// whose revocation must be retried by a background goroutine.
type consulACLsAPI struct {
	// aclClient is the API subset of the real consul client we need for
	// managing Service Identity tokens
	aclClient consul.ACLsAPI

	// limiter is used to rate limit requests to consul
	limiter *rate.Limiter

	// bgRevokeLock guards bgRetryRevocation and bgRevokeStopped.
	bgRevokeLock sync.Mutex
	// Track accessors that must have their revocation retried in the background.
	bgRetryRevocation []*structs.SITokenAccessor
	// Track whether the background revocations have been stopped, to avoid
	// creating tokens we would no longer be able to revoke. Expected to be used
	// on a Server shutdown.
	bgRevokeStopped bool

	// purgeFunc is the Nomad Server function that removes the reference to the
	// SI token accessor from the persistent raft store
	purgeFunc PurgeSITokenAccessorFunc

	// stopC is used to signal the client is shutting down and token revocation
	// background goroutine should stop
	stopC chan struct{}

	// logger is used to log messages
	logger hclog.Logger
}
   147  
   148  func NewConsulACLsAPI(aclClient consul.ACLsAPI, logger hclog.Logger, purgeFunc PurgeSITokenAccessorFunc) *consulACLsAPI {
   149  	if purgeFunc == nil {
   150  		purgeFunc = func([]*structs.SITokenAccessor) error { return nil }
   151  	}
   152  
   153  	c := &consulACLsAPI{
   154  		aclClient: aclClient,
   155  		limiter:   rate.NewLimiter(siTokenRequestRateLimit, int(siTokenRequestRateLimit)),
   156  		stopC:     make(chan struct{}),
   157  		purgeFunc: purgeFunc,
   158  		logger:    logger.Named("consul_acl"),
   159  	}
   160  
   161  	go c.bgRetryRevokeDaemon()
   162  
   163  	return c
   164  }
   165  
   166  // Stop stops background token revocations from happening. Once stopped, tokens
   167  // may no longer be created.
   168  func (c *consulACLsAPI) Stop() {
   169  	c.bgRevokeLock.Lock()
   170  	defer c.bgRevokeLock.Unlock()
   171  
   172  	c.stopC <- struct{}{}
   173  	c.bgRevokeStopped = true
   174  }
   175  
   176  func (c *consulACLsAPI) CheckSIPolicy(ctx context.Context, task, secretID string) error {
   177  	defer metrics.MeasureSince([]string{"nomad", "consul", "check_si_policy"}, time.Now())
   178  
   179  	if id := strings.TrimSpace(secretID); id == "" {
   180  		return errors.New("missing consul token")
   181  	}
   182  
   183  	// Ensure we are under our rate limit.
   184  	if err := c.limiter.Wait(ctx); err != nil {
   185  		return err
   186  	}
   187  
   188  	opToken, _, err := c.aclClient.TokenReadSelf(&api.QueryOptions{
   189  		AllowStale: false,
   190  		Token:      secretID,
   191  	})
   192  	if err != nil {
   193  		return errors.Wrap(err, "unable to validate operator consul token")
   194  	}
   195  
   196  	allowable, err := c.hasSufficientPolicy(task, opToken)
   197  	if err != nil {
   198  		return errors.Wrap(err, "unable to validate operator consul token")
   199  	}
   200  	if !allowable {
   201  		return errors.Errorf("permission denied for %q", task)
   202  	}
   203  
   204  	return nil
   205  }
   206  
   207  func (c *consulACLsAPI) CreateToken(ctx context.Context, sir ServiceIdentityRequest) (*structs.SIToken, error) {
   208  	defer metrics.MeasureSince([]string{"nomad", "consul", "create_token"}, time.Now())
   209  
   210  	// make sure the background token revocations have not been stopped
   211  	c.bgRevokeLock.Lock()
   212  	stopped := c.bgRevokeStopped
   213  	c.bgRevokeLock.Unlock()
   214  
   215  	if stopped {
   216  		return nil, errors.New("client stopped and may no longer create tokens")
   217  	}
   218  
   219  	// sanity check the metadata for the token we want
   220  	if err := sir.Validate(); err != nil {
   221  		return nil, err
   222  	}
   223  
   224  	// the SI token created must be for the service, not the sidecar of the service
   225  	// https://www.consul.io/docs/acl/acl-system.html#acl-service-identities
   226  	service := sir.TaskKind.Value()
   227  	partial := &api.ACLToken{
   228  		Description:       sir.Description(),
   229  		ServiceIdentities: []*api.ACLServiceIdentity{{ServiceName: service}},
   230  	}
   231  
   232  	// Ensure we are under our rate limit.
   233  	if err := c.limiter.Wait(ctx); err != nil {
   234  		return nil, err
   235  	}
   236  
   237  	token, _, err := c.aclClient.TokenCreate(partial, nil)
   238  	if err != nil {
   239  		return nil, err
   240  	}
   241  
   242  	return &structs.SIToken{
   243  		TaskName:   sir.TaskName,
   244  		AccessorID: token.AccessorID,
   245  		SecretID:   token.SecretID,
   246  	}, nil
   247  }
   248  
   249  // RevokeTokens revokes the passed set of SI token accessors. If committed is set,
   250  // the client's purge function is called (which purges the tokens from the Server's
   251  // persistent store). If there is an error purging either because of Consul failures
   252  // or because of the purge function, the revocation is retried in the background.
   253  //
   254  // The revocation of an SI token accessor is idempotent.
   255  //
   256  // A return value of true indicates one or more accessors were stored for
   257  // a revocation retry attempt in the background (intended for tests).
   258  func (c *consulACLsAPI) RevokeTokens(ctx context.Context, accessors []*structs.SITokenAccessor, committed bool) bool {
   259  	defer metrics.MeasureSince([]string{"nomad", "consul", "revoke_tokens"}, time.Now())
   260  
   261  	nTokens := float32(len(accessors))
   262  
   263  	if err := c.parallelRevoke(ctx, accessors); err != nil {
   264  		// If these tokens were uncommitted into raft, it is a best effort to
   265  		// revoke them now. If this immediate revocation does not work, Nomad loses
   266  		// track of them and will need to do a brute reconciliation later. This
   267  		// should happen rarely, and will be implemented soon.
   268  		if !committed {
   269  			metrics.IncrCounter([]string{"nomad", "consul", "undistributed_si_tokens_abandoned"}, nTokens)
   270  		}
   271  
   272  		c.logger.Warn("failed to revoke tokens, will reattempt later", "error", err)
   273  		c.storeForRevocation(accessors)
   274  		return true
   275  	}
   276  
   277  	if !committed {
   278  		// Un-committed tokens were revoked without incident (nothing to purge)
   279  		metrics.IncrCounter([]string{"nomad", "consul", "undistributed_si_tokens_revoked"}, nTokens)
   280  		return false
   281  	}
   282  
   283  	// Committed tokens were revoked without incident, now purge them
   284  	if err := c.purgeFunc(accessors); err != nil {
   285  		c.logger.Error("failed to purge SI token accessors", "error", err)
   286  		c.storeForRevocation(accessors)
   287  		return true
   288  	}
   289  
   290  	// Track that the SI tokens were revoked and purged successfully
   291  	metrics.IncrCounter([]string{"nomad", "consul", "distributed_si_tokens_revoked"}, nTokens)
   292  	return false
   293  }
   294  
// MarkForRevocation queues the given accessors for background revocation
// without attempting to revoke them immediately.
func (c *consulACLsAPI) MarkForRevocation(accessors []*structs.SITokenAccessor) {
	c.storeForRevocation(accessors)
}
   298  
   299  func (c *consulACLsAPI) storeForRevocation(accessors []*structs.SITokenAccessor) {
   300  	c.bgRevokeLock.Lock()
   301  	defer c.bgRevokeLock.Unlock()
   302  
   303  	// copy / append the set of accessors we must track for revocation in the
   304  	// background
   305  	c.bgRetryRevocation = append(c.bgRetryRevocation, accessors...)
   306  }
   307  
   308  func (c *consulACLsAPI) parallelRevoke(ctx context.Context, accessors []*structs.SITokenAccessor) error {
   309  	g, pCtx := errgroup.WithContext(ctx)
   310  
   311  	// Cap the handlers
   312  	handlers := len(accessors)
   313  	if handlers > siTokenMaxParallelRevokes {
   314  		handlers = siTokenMaxParallelRevokes
   315  	}
   316  
   317  	// Revoke the SI Token Accessors
   318  	input := make(chan *structs.SITokenAccessor, handlers)
   319  	for i := 0; i < handlers; i++ {
   320  		g.Go(func() error {
   321  			for {
   322  				select {
   323  				case accessor, ok := <-input:
   324  					if !ok {
   325  						return nil
   326  					}
   327  					if err := c.singleRevoke(ctx, accessor); err != nil {
   328  						return errors.Wrapf(err,
   329  							"failed to revoke SI token accessor (alloc %q, node %q, task %q)",
   330  							accessor.AllocID, accessor.NodeID, accessor.TaskName,
   331  						)
   332  					}
   333  				case <-pCtx.Done():
   334  					return nil
   335  				}
   336  			}
   337  		})
   338  	}
   339  
   340  	// Send the input
   341  	go func() {
   342  		defer close(input)
   343  		for _, accessor := range accessors {
   344  			select {
   345  			case <-pCtx.Done():
   346  				return
   347  			case input <- accessor:
   348  			}
   349  		}
   350  	}()
   351  
   352  	// Wait for everything to complete
   353  	return g.Wait()
   354  }
   355  
   356  func (c *consulACLsAPI) singleRevoke(ctx context.Context, accessor *structs.SITokenAccessor) error {
   357  	c.logger.Trace("revoke SI token", "task", accessor.TaskName, "alloc_id", accessor.AllocID, "node_id", accessor.NodeID)
   358  
   359  	// Ensure we are under our rate limit.
   360  	if err := c.limiter.Wait(ctx); err != nil {
   361  		return err
   362  	}
   363  
   364  	// Consul will no-op the deletion of a non-existent token (no error)
   365  	_, err := c.aclClient.TokenDelete(accessor.AccessorID, nil)
   366  	return err
   367  }
   368  
   369  func (c *consulACLsAPI) bgRetryRevokeDaemon() {
   370  	ticker := time.NewTicker(siTokenRevocationInterval)
   371  	defer ticker.Stop()
   372  
   373  	for {
   374  		select {
   375  		case <-c.stopC:
   376  			return
   377  		case <-ticker.C:
   378  			c.bgRetryRevoke()
   379  		}
   380  	}
   381  }
   382  
// maxConsulRevocationBatchSize is the maximum number of SI token accessors a
// single bgRetryRevoke pass will attempt to revoke.
const maxConsulRevocationBatchSize = 1000
   386  
   387  func (c *consulACLsAPI) bgRetryRevoke() {
   388  	c.bgRevokeLock.Lock()
   389  	defer c.bgRevokeLock.Unlock()
   390  
   391  	// fast path, nothing to do
   392  	if len(c.bgRetryRevocation) == 0 {
   393  		return
   394  	}
   395  
   396  	// unlike vault tokens, SI tokens do not have a TTL, and so we must try to
   397  	// remove all SI token accessors, every time, until they're gone
   398  	toRevoke := len(c.bgRetryRevocation)
   399  	if toRevoke > maxConsulRevocationBatchSize {
   400  		toRevoke = maxConsulRevocationBatchSize
   401  	}
   402  	toPurge := make([]*structs.SITokenAccessor, toRevoke)
   403  	copy(toPurge, c.bgRetryRevocation)
   404  
   405  	if err := c.parallelRevoke(context.Background(), toPurge); err != nil {
   406  		c.logger.Warn("background SI token revocation failed", "error", err)
   407  		return
   408  	}
   409  
   410  	// Call the revocation function
   411  	if err := c.purgeFunc(toPurge); err != nil {
   412  		// Just try again later (revocation is idempotent)
   413  		c.logger.Error("background SI token purge failed", "error", err)
   414  		return
   415  	}
   416  
   417  	// Track that the SI tokens were revoked successfully
   418  	nTokens := float32(len(toPurge))
   419  	metrics.IncrCounter([]string{"nomad", "consul", "distributed_tokens_revoked"}, nTokens)
   420  
   421  	// Reset the list of accessors to retry, since we just removed them all.
   422  	c.bgRetryRevocation = nil
   423  }
   424  
// ListTokens is a placeholder for listing the tokens in Consul. It is not yet
// implemented and always returns a nil slice together with a
// "not yet implemented" error.
func (c *consulACLsAPI) ListTokens() ([]string, error) {
	// defer metrics.MeasureSince([]string{"nomad", "consul", "list_tokens"}, time.Now())
	return nil, errors.New("not yet implemented")
}
   429  
   430  // purgeSITokenAccessors is the Nomad Server method which will remove the set
   431  // of SI token accessors from the persistent raft store.
   432  func (s *Server) purgeSITokenAccessors(accessors []*structs.SITokenAccessor) error {
   433  	// Commit this update via Raft
   434  	request := structs.SITokenAccessorsRequest{Accessors: accessors}
   435  	_, _, err := s.raftApply(structs.ServiceIdentityAccessorDeregisterRequestType, request)
   436  	return err
   437  }