gopkg.in/hashicorp/nomad.v0@v0.11.8/nomad/consul.go (about) 1 package nomad 2 3 import ( 4 "context" 5 "fmt" 6 "strings" 7 "sync" 8 "time" 9 10 "github.com/armon/go-metrics" 11 "github.com/hashicorp/consul/api" 12 "github.com/hashicorp/go-hclog" 13 "github.com/hashicorp/nomad/command/agent/consul" 14 "github.com/hashicorp/nomad/nomad/structs" 15 "github.com/pkg/errors" 16 "golang.org/x/sync/errgroup" 17 "golang.org/x/time/rate" 18 ) 19 20 const ( 21 // siTokenDescriptionFmt is the format for the .Description field of 22 // service identity tokens generated on behalf of Nomad. 23 siTokenDescriptionFmt = "_nomad_si [%s] [%s] [%s]" 24 25 // siTokenRequestRateLimit is the maximum number of requests per second Nomad 26 // will make against Consul for requesting SI tokens. 27 siTokenRequestRateLimit rate.Limit = 500 28 29 // siTokenMaxParallelRevokes is the maximum number of parallel SI token 30 // revocation requests Nomad will make against Consul. 31 siTokenMaxParallelRevokes = 64 32 33 // siTokenRevocationInterval is the interval at which SI tokens that failed 34 // initial revocation are retried. 35 siTokenRevocationInterval = 5 * time.Minute 36 ) 37 38 const ( 39 // ConsulPolicyWrite is the literal text of the policy field of a Consul Policy 40 // Rule that we check when validating an Operator Consul token against the 41 // necessary permissions for creating a Service Identity token for a given 42 // service. 43 // 44 // The rule may be: 45 // - service.<exact> 46 // - service."*" (wildcard) 47 // - service_prefix.<matching> (including empty string) 48 // 49 // e.g. 50 // service "web" { policy = "write" } 51 // service_prefix "" { policy = "write" } 52 ConsulPolicyWrite = "write" 53 ) 54 55 type ServiceIdentityRequest struct { 56 TaskKind structs.TaskKind 57 TaskName string 58 ClusterID string 59 AllocID string 60 } 61 62 func (sir ServiceIdentityRequest) Validate() error { 63 switch { 64 case sir.ClusterID == "": 65 return errors.New("cluster id not set") 66 case sir.AllocID == "": 67 return errors.New("alloc id not set") 68 case sir.TaskName == "": 69 return errors.New("task name not set") 70 case sir.TaskKind == "": 71 return errors.New("task kind not set") 72 default: 73 return nil 74 } 75 } 76 77 func (sir ServiceIdentityRequest) Description() string { 78 return fmt.Sprintf(siTokenDescriptionFmt, sir.ClusterID, sir.AllocID, sir.TaskName) 79 } 80 81 // ConsulACLsAPI is an abstraction over the consul/api.ACL API used by Nomad 82 // Server. 83 // 84 // ACL requirements 85 // - acl:write (transitive through ACLsAPI) 86 type ConsulACLsAPI interface { 87 88 // CheckSIPolicy checks that the given operator token has the equivalent ACL 89 // permissiveness that a Service Identity token policy for task would have. 90 CheckSIPolicy(ctx context.Context, task, secretID string) error 91 92 // Create instructs Consul to create a Service Identity token. 93 CreateToken(context.Context, ServiceIdentityRequest) (*structs.SIToken, error) 94 95 // RevokeTokens instructs Consul to revoke the given token accessors. 96 RevokeTokens(context.Context, []*structs.SITokenAccessor, bool) bool 97 98 // MarkForRevocation marks the tokens for background revocation 99 MarkForRevocation([]*structs.SITokenAccessor) 100 101 // Stop is used to stop background token revocations. Intended to be used 102 // on Nomad Server shutdown. 103 Stop() 104 105 // todo(shoenig): use list endpoint for finding orphaned tokens 106 // ListTokens lists every token in Consul. 107 // ListTokens() ([]string, error) 108 } 109 110 // PurgeSITokenAccessorFunc is called to remove SI Token accessors from the 111 // system (i.e. raft). If the function returns an error, the token will still 112 // be tracked and revocation attempts will retry in the background until there 113 // is a success. 114 type PurgeSITokenAccessorFunc func([]*structs.SITokenAccessor) error 115 116 type SITokenStats struct { 117 TrackedForRevoke int 118 } 119 120 type consulACLsAPI struct { 121 // aclClient is the API subset of the real consul client we need for 122 // managing Service Identity tokens 123 aclClient consul.ACLsAPI 124 125 // limiter is used to rate limit requests to consul 126 limiter *rate.Limiter 127 128 bgRevokeLock sync.Mutex 129 // Track accessors that must have their revocation retried in the background. 130 bgRetryRevocation []*structs.SITokenAccessor 131 // Track whether the background revocations have been stopped, to avoid 132 // creating tokens we would no longer be able to revoke. Expected to be used 133 // on a Server shutdown. 134 bgRevokeStopped bool 135 136 // purgeFunc is the Nomad Server function that removes the reference to the 137 // SI token accessor from the persistent raft store 138 purgeFunc PurgeSITokenAccessorFunc 139 140 // stopC is used to signal the client is shutting down and token revocation 141 // background goroutine should stop 142 stopC chan struct{} 143 144 // logger is used to log messages 145 logger hclog.Logger 146 } 147 148 func NewConsulACLsAPI(aclClient consul.ACLsAPI, logger hclog.Logger, purgeFunc PurgeSITokenAccessorFunc) *consulACLsAPI { 149 if purgeFunc == nil { 150 purgeFunc = func([]*structs.SITokenAccessor) error { return nil } 151 } 152 153 c := &consulACLsAPI{ 154 aclClient: aclClient, 155 limiter: rate.NewLimiter(siTokenRequestRateLimit, int(siTokenRequestRateLimit)), 156 stopC: make(chan struct{}), 157 purgeFunc: purgeFunc, 158 logger: logger.Named("consul_acl"), 159 } 160 161 go c.bgRetryRevokeDaemon() 162 163 return c 164 } 165 166 // Stop stops background token revocations from happening. Once stopped, tokens 167 // may no longer be created. 168 func (c *consulACLsAPI) Stop() { 169 c.bgRevokeLock.Lock() 170 defer c.bgRevokeLock.Unlock() 171 172 c.stopC <- struct{}{} 173 c.bgRevokeStopped = true 174 } 175 176 func (c *consulACLsAPI) CheckSIPolicy(ctx context.Context, task, secretID string) error { 177 defer metrics.MeasureSince([]string{"nomad", "consul", "check_si_policy"}, time.Now()) 178 179 if id := strings.TrimSpace(secretID); id == "" { 180 return errors.New("missing consul token") 181 } 182 183 // Ensure we are under our rate limit. 184 if err := c.limiter.Wait(ctx); err != nil { 185 return err 186 } 187 188 opToken, _, err := c.aclClient.TokenReadSelf(&api.QueryOptions{ 189 AllowStale: false, 190 Token: secretID, 191 }) 192 if err != nil { 193 return errors.Wrap(err, "unable to validate operator consul token") 194 } 195 196 allowable, err := c.hasSufficientPolicy(task, opToken) 197 if err != nil { 198 return errors.Wrap(err, "unable to validate operator consul token") 199 } 200 if !allowable { 201 return errors.Errorf("permission denied for %q", task) 202 } 203 204 return nil 205 } 206 207 func (c *consulACLsAPI) CreateToken(ctx context.Context, sir ServiceIdentityRequest) (*structs.SIToken, error) { 208 defer metrics.MeasureSince([]string{"nomad", "consul", "create_token"}, time.Now()) 209 210 // make sure the background token revocations have not been stopped 211 c.bgRevokeLock.Lock() 212 stopped := c.bgRevokeStopped 213 c.bgRevokeLock.Unlock() 214 215 if stopped { 216 return nil, errors.New("client stopped and may no longer create tokens") 217 } 218 219 // sanity check the metadata for the token we want 220 if err := sir.Validate(); err != nil { 221 return nil, err 222 } 223 224 // the SI token created must be for the service, not the sidecar of the service 225 // https://www.consul.io/docs/acl/acl-system.html#acl-service-identities 226 service := sir.TaskKind.Value() 227 partial := &api.ACLToken{ 228 Description: sir.Description(), 229 ServiceIdentities: []*api.ACLServiceIdentity{{ServiceName: service}}, 230 } 231 232 // Ensure we are under our rate limit. 233 if err := c.limiter.Wait(ctx); err != nil { 234 return nil, err 235 } 236 237 token, _, err := c.aclClient.TokenCreate(partial, nil) 238 if err != nil { 239 return nil, err 240 } 241 242 return &structs.SIToken{ 243 TaskName: sir.TaskName, 244 AccessorID: token.AccessorID, 245 SecretID: token.SecretID, 246 }, nil 247 } 248 249 // RevokeTokens revokes the passed set of SI token accessors. If committed is set, 250 // the client's purge function is called (which purges the tokens from the Server's 251 // persistent store). If there is an error purging either because of Consul failures 252 // or because of the purge function, the revocation is retried in the background. 253 // 254 // The revocation of an SI token accessor is idempotent. 255 // 256 // A return value of true indicates one or more accessors were stored for 257 // a revocation retry attempt in the background (intended for tests). 258 func (c *consulACLsAPI) RevokeTokens(ctx context.Context, accessors []*structs.SITokenAccessor, committed bool) bool { 259 defer metrics.MeasureSince([]string{"nomad", "consul", "revoke_tokens"}, time.Now()) 260 261 nTokens := float32(len(accessors)) 262 263 if err := c.parallelRevoke(ctx, accessors); err != nil { 264 // If these tokens were uncommitted into raft, it is a best effort to 265 // revoke them now. If this immediate revocation does not work, Nomad loses 266 // track of them and will need to do a brute reconciliation later. This 267 // should happen rarely, and will be implemented soon. 268 if !committed { 269 metrics.IncrCounter([]string{"nomad", "consul", "undistributed_si_tokens_abandoned"}, nTokens) 270 } 271 272 c.logger.Warn("failed to revoke tokens, will reattempt later", "error", err) 273 c.storeForRevocation(accessors) 274 return true 275 } 276 277 if !committed { 278 // Un-committed tokens were revoked without incident (nothing to purge) 279 metrics.IncrCounter([]string{"nomad", "consul", "undistributed_si_tokens_revoked"}, nTokens) 280 return false 281 } 282 283 // Committed tokens were revoked without incident, now purge them 284 if err := c.purgeFunc(accessors); err != nil { 285 c.logger.Error("failed to purge SI token accessors", "error", err) 286 c.storeForRevocation(accessors) 287 return true 288 } 289 290 // Track that the SI tokens were revoked and purged successfully 291 metrics.IncrCounter([]string{"nomad", "consul", "distributed_si_tokens_revoked"}, nTokens) 292 return false 293 } 294 295 func (c *consulACLsAPI) MarkForRevocation(accessors []*structs.SITokenAccessor) { 296 c.storeForRevocation(accessors) 297 } 298 299 func (c *consulACLsAPI) storeForRevocation(accessors []*structs.SITokenAccessor) { 300 c.bgRevokeLock.Lock() 301 defer c.bgRevokeLock.Unlock() 302 303 // copy / append the set of accessors we must track for revocation in the 304 // background 305 c.bgRetryRevocation = append(c.bgRetryRevocation, accessors...) 306 } 307 308 func (c *consulACLsAPI) parallelRevoke(ctx context.Context, accessors []*structs.SITokenAccessor) error { 309 g, pCtx := errgroup.WithContext(ctx) 310 311 // Cap the handlers 312 handlers := len(accessors) 313 if handlers > siTokenMaxParallelRevokes { 314 handlers = siTokenMaxParallelRevokes 315 } 316 317 // Revoke the SI Token Accessors 318 input := make(chan *structs.SITokenAccessor, handlers) 319 for i := 0; i < handlers; i++ { 320 g.Go(func() error { 321 for { 322 select { 323 case accessor, ok := <-input: 324 if !ok { 325 return nil 326 } 327 if err := c.singleRevoke(ctx, accessor); err != nil { 328 return errors.Wrapf(err, 329 "failed to revoke SI token accessor (alloc %q, node %q, task %q)", 330 accessor.AllocID, accessor.NodeID, accessor.TaskName, 331 ) 332 } 333 case <-pCtx.Done(): 334 return nil 335 } 336 } 337 }) 338 } 339 340 // Send the input 341 go func() { 342 defer close(input) 343 for _, accessor := range accessors { 344 select { 345 case <-pCtx.Done(): 346 return 347 case input <- accessor: 348 } 349 } 350 }() 351 352 // Wait for everything to complete 353 return g.Wait() 354 } 355 356 func (c *consulACLsAPI) singleRevoke(ctx context.Context, accessor *structs.SITokenAccessor) error { 357 c.logger.Trace("revoke SI token", "task", accessor.TaskName, "alloc_id", accessor.AllocID, "node_id", accessor.NodeID) 358 359 // Ensure we are under our rate limit. 360 if err := c.limiter.Wait(ctx); err != nil { 361 return err 362 } 363 364 // Consul will no-op the deletion of a non-existent token (no error) 365 _, err := c.aclClient.TokenDelete(accessor.AccessorID, nil) 366 return err 367 } 368 369 func (c *consulACLsAPI) bgRetryRevokeDaemon() { 370 ticker := time.NewTicker(siTokenRevocationInterval) 371 defer ticker.Stop() 372 373 for { 374 select { 375 case <-c.stopC: 376 return 377 case <-ticker.C: 378 c.bgRetryRevoke() 379 } 380 } 381 } 382 383 // maxConsulRevocationBatchSize is the maximum tokens a bgRetryRevoke should revoke 384 // at any given time. 385 const maxConsulRevocationBatchSize = 1000 386 387 func (c *consulACLsAPI) bgRetryRevoke() { 388 c.bgRevokeLock.Lock() 389 defer c.bgRevokeLock.Unlock() 390 391 // fast path, nothing to do 392 if len(c.bgRetryRevocation) == 0 { 393 return 394 } 395 396 // unlike vault tokens, SI tokens do not have a TTL, and so we must try to 397 // remove all SI token accessors, every time, until they're gone 398 toRevoke := len(c.bgRetryRevocation) 399 if toRevoke > maxConsulRevocationBatchSize { 400 toRevoke = maxConsulRevocationBatchSize 401 } 402 toPurge := make([]*structs.SITokenAccessor, toRevoke) 403 copy(toPurge, c.bgRetryRevocation) 404 405 if err := c.parallelRevoke(context.Background(), toPurge); err != nil { 406 c.logger.Warn("background SI token revocation failed", "error", err) 407 return 408 } 409 410 // Call the revocation function 411 if err := c.purgeFunc(toPurge); err != nil { 412 // Just try again later (revocation is idempotent) 413 c.logger.Error("background SI token purge failed", "error", err) 414 return 415 } 416 417 // Track that the SI tokens were revoked successfully 418 nTokens := float32(len(toPurge)) 419 metrics.IncrCounter([]string{"nomad", "consul", "distributed_tokens_revoked"}, nTokens) 420 421 // Reset the list of accessors to retry, since we just removed them all. 422 c.bgRetryRevocation = nil 423 } 424 425 func (c *consulACLsAPI) ListTokens() ([]string, error) { 426 // defer metrics.MeasureSince([]string{"nomad", "consul", "list_tokens"}, time.Now()) 427 return nil, errors.New("not yet implemented") 428 } 429 430 // purgeSITokenAccessors is the Nomad Server method which will remove the set 431 // of SI token accessors from the persistent raft store. 432 func (s *Server) purgeSITokenAccessors(accessors []*structs.SITokenAccessor) error { 433 // Commit this update via Raft 434 request := structs.SITokenAccessorsRequest{Accessors: accessors} 435 _, _, err := s.raftApply(structs.ServiceIdentityAccessorDeregisterRequestType, request) 436 return err 437 }